In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library deprecation/convergence warnings that
# may matter for the imputation and model-fitting cells below — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Read the CSV, shuffle all rows with a fixed seed, and renumber the index from 0.
df = (
    pd.read_csv(r"Processed_Miscarriage_Dataset.csv")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Preview the first 100 rows of the frame.
df.head(100)

In [None]:
# (n_rows, n_columns) of the frame; the bare name displays it as the cell output.
shape = df.shape
shape

In [None]:
# Keep the column index in a variable so it can be displayed/reused later.
columns = df.columns

In [None]:
# Display the column names.
columns

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Keep only the first occurrence of each fully duplicated row.
df = df.drop_duplicates()

In [None]:
# Missing-value count per column, before imputation.
df.isnull().sum()

In [None]:
# Columns whose missing values are replaced by their most frequent value.
mode_impute_cols = ['RelationWithHus', 'ChildEverBorn', 'Pre_termi_type', 'Ultra_bfr_preg_termi',
                    'Major_water_src', 'Work_currently', 'Women_curr_job', 'Husband_job', 'Sex_Transmit_Diseases','AgeOf1stMensuration']

# Build one {column: mode} mapping and fill in a single, non-chained call.
# `df[col].fillna(value, inplace=True)` operates on a temporary Series: it is
# deprecated in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
# Columns that are entirely NaN have no mode and are left untouched instead of
# raising on `mode()[0]`.
mode_fill_values = {
    col: df[col].mode().iloc[0]
    for col in mode_impute_cols
    if df[col].notna().any()
}
df = df.fillna(mode_fill_values)

In [None]:
!pip install scikit-learn

In [None]:
from sklearn.impute import KNNImputer
# Numeric columns: impute with the column median.
median_husband_age = df['HusbandAge'].median()
median_age_1st_preg = df['AgeOf1stPreg'].median()

# Category-like columns: impute with the most frequent value. 'Major_water_src'
# is treated as categorical in the mode-imputation list earlier in the
# notebook, so it is filled with its mode here as well (a median is undefined
# for text values and can yield a non-existent category for numeric codes).
mode_relation = df['RelationWithHus'].mode()[0]
mode_water_src = df['Major_water_src'].mode()[0]

fill_values = {
    'HusbandAge': median_husband_age,
    'AgeOf1stPreg': median_age_1st_preg,
    'RelationWithHus': mode_relation,
    'Major_water_src': mode_water_src,
    # For these columns a missing answer is kept as its own 'Missing' category.
    'Hus_Alcohol_Freq': 'Missing',
    'Physical_Abuse': 'Missing',
    'Wife_Alcohol_Freq': 'Missing',
}
# One non-chained fill: `df[col].fillna(..., inplace=True)` acts on a temporary
# Series and does not update `df` under pandas Copy-on-Write.
df = df.fillna(fill_values)

# errors='ignore' keeps the cell re-runnable after the columns are already gone.
df = df.drop(columns=['Reason_termination', 'Chemotherapy'], errors='ignore')

In [None]:
# Missing-value count per column, re-checked after imputation.
df.isnull().sum()

In [None]:
def classify_features(df, max_categories=10):
    """Group the columns of ``df`` by dtype family and cardinality.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.
    max_categories : int, default 10
        A column with fewer than ``max_categories`` distinct non-null values
        is considered low-cardinality.

    Returns
    -------
    tuple of four lists of column names, in column order:
        categorical      - text/object/category columns, low cardinality
        non_categorical  - text/object/category columns, high cardinality
        discrete         - numeric columns, low cardinality
        continuous       - numeric columns, high cardinality

    Notes
    -----
    Numeric means any dtype accepted by ``pandas.api.types.is_numeric_dtype``
    (all integer/float widths, nullable numerics, and bool), not only
    ``int64``/``float64``. Columns of any other dtype family (e.g. datetimes)
    are not placed in any list.
    """
    categorical_features = []
    non_categorical_features = []
    discrete_features = []
    continuous_features = []

    dtype_checks = pd.api.types

    for column in df.columns:
        series = df[column]
        dtype = series.dtype
        low_cardinality = series.nunique() < max_categories

        if dtype_checks.is_numeric_dtype(dtype):
            target = discrete_features if low_cardinality else continuous_features
        elif (
            dtype_checks.is_object_dtype(dtype)
            or dtype_checks.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        ):
            target = categorical_features if low_cardinality else non_categorical_features
        else:
            # Neither numeric nor text-like: leave the column unclassified.
            continue

        target.append(column)

    return categorical_features, non_categorical_features, discrete_features, continuous_features

In [None]:
# Unpack the four feature-name lists returned by classify_features.
categorical, non_categorical, discrete, continuous = classify_features(df)

In [None]:
# Print each feature group under its label, one line per group.
for label, features in (
    ("Categorical Features:", categorical),
    ("Non-Categorical Features:", non_categorical),
    ("Discrete Features:", discrete),
    ("Continuous Features:", continuous),
):
    print(label, features)

In [None]:
# For every discrete column: its name, its distinct values, then a blank line.
for name in discrete:
    print(name, df[name].unique(), "", sep="\n")

In [None]:
# Frequency table of every discrete column, each followed by a blank line.
for name in discrete:
    print(df[name].value_counts(), end="\n\n")

In [None]:
# One count plot per discrete feature, with the count written above each bar.
for i in discrete:
    fig, ax = plt.subplots(figsize=(15, 6))
    sns.countplot(x=i, data=df, palette='hls', ax=ax)
    ax.set_title(f'Count of {i}')

    for p in ax.patches:
        height = p.get_height()
        # Skip empty or NaN-height patches so no "0"/"nan" labels are drawn.
        if not height > 0:
            continue
        # Bar heights are floats; counts are whole numbers, so label as int.
        ax.annotate(f'{int(height)}',
                    xy=(p.get_x() + p.get_width() / 2., height),
                    xytext=(0, 10),
                    textcoords='offset points',
                    ha='center', va='center')

    plt.show()
    # Release the figure so looping over many columns does not accumulate them.
    plt.close(fig)

In [None]:
!pip install plotly

In [None]:
import plotly.express as px

# One pie chart per discrete feature, showing the share of each distinct value.
for i in discrete:
    counts = df[i].value_counts()
    fig = px.pie(
        counts,
        names=counts.index,
        values=counts.values,
        title=f'Distribution of {i}',
    )
    fig.show()

In [None]:
# One 20-bin histogram with a KDE overlay per continuous feature.
for i in continuous:
    fig, ax = plt.subplots(figsize=(15, 6))
    # No `palette`: seaborn ignores it when no `hue` variable is given.
    sns.histplot(df[i], bins=20, kde=True, ax=ax)
    ax.set_title(f'Distribution of {i}')
    plt.xticks(rotation=90)
    plt.show()
    # Release the figure so looping over many columns does not accumulate them.
    plt.close(fig)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Replace every object-dtype column with integer codes. Codes start at 1 and
# are assigned in the order df[i].unique() returns the values.
columns = df.columns
for i in columns:
    if df[i].dtype != 'object':
        continue
    cat = {value: code for code, value in enumerate(df[i].unique(), start=1)}
    df[i] = df[i].map(cat)

In [None]:
# Inspect the column dtypes after the encoding step.
df.dtypes

In [None]:
# Target is 'Preg_termination' cast to int; features are all remaining columns.
y = df['Preg_termination'].astype(int)
X = df.drop(columns=['Preg_termination'])

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

In [None]:
# Raise the iteration cap: the default max_iter=100 is often too low for the
# solver to converge on unscaled features, and the resulting
# ConvergenceWarning would not be visible because warnings are suppressed at
# the top of the notebook.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

In [None]:
# Predicted class labels for the test features, from the fitted logistic regression.
y_pred = logreg.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_curve, precision_recall_curve, auc

In [None]:
# Scalar scores comparing the test labels with the current predictions.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

precision = precision_score(y_true=y_test, y_pred=y_pred)
print('Precision', precision)

recall = recall_score(y_true=y_test, y_pred=y_pred)
print('Recall', recall)

f1 = f1_score(y_true=y_test, y_pred=y_pred)
print('F1:', f1)

# Confusion matrix of true labels versus predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Per-class predicted probabilities from the logistic regression model.
y_prob = logreg.predict_proba(X_test)
print(y_prob)

# Text report produced by classification_report.
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Fix the seed so the fitted tree (and its accuracy) is reproducible across
# runs, consistent with the seeded split and random forest in this notebook.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Overwrites y_pred with this model's test-set predictions.
y_pred = dt_model.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
# Random forest of 100 trees with a fixed seed; fit() returns the estimator itself.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Overwrites y_pred with this model's test-set predictions.
y_pred = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Support vector classifier with a linear kernel; fit() returns the estimator itself.
svm_model = SVC(kernel='linear').fit(X_train, y_train)

# Overwrites y_pred with this model's test-set predictions.
y_pred = svm_model.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
# k-nearest-neighbours classifier with k=5; fit() returns the estimator itself.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Overwrites y_pred with this model's test-set predictions.
y_pred = knn_model.predict(X_test)
print("KNN Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Gaussian naive Bayes with default settings; fit() returns the estimator itself.
nb_model = GaussianNB().fit(X_train, y_train)

# Overwrites y_pred with this model's test-set predictions.
y_pred = nb_model.predict(X_test)
print("Naïve Bayes Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))