# **STINTSY Project**
**S16 Group <#>**

Names:

Adrada, Jasper John

David, Rain Caitlin Aelis

Badiola, Maxine Beatriz

Ogatia, Graham Joshua



# **Load Dataset**

In [None]:
import pandas as pd

# Location of the raw pumpkin-seed CSV, relative to the notebook.
file = 'pumpkin_seeds.csv'

# latin1 can decode any byte, so the read does not fail on the non-ASCII
# class names; those names are repaired further down in the notebook.
df = pd.read_csv(filepath_or_buffer=file, encoding='latin1')

# Preview the first ten rows.
display(df.head(n=10))

# I. Introduction to the problem/task and dataset

# II. Description of the dataset

# III. List of requirements

# IV. Data preprocessing and cleaning

## A. Cleaning the Dataset

Check for duplicates.

In [None]:
# Mark every row that repeats an earlier row, then count the marks.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"Duplicate rows: {duplicates}")


Check for null values.

In [None]:
# Per-column count of missing cells (isna is the same check as isnull).
null_val = df.isna().sum()
print("Missing values in each column:\n", null_val)

Check the unique values in the `Class` column.

In [None]:
# Distinct labels currently stored in the target column.
class_values = df['Class'].unique()
print("Class:", class_values)

Replace the corrupted characters to fix the encoding issues.

In [None]:
# Garbled class name -> ASCII spelling used in the rest of the notebook.
class_name_fixes = {
    '\x82er\x8develik': 'Cercevelik',
    '\x86rg\x9fp Sivrisi': 'Urgup Sivrisi',
}

# Literal (non-regex) substitution, applied one mapping at a time.
for garbled, clean in class_name_fixes.items():
    df['Class'] = df['Class'].str.replace(garbled, clean, regex=False)

print("Fixed unique values in 'Class':", df['Class'].unique())


The models we are using are **Decision Trees**, **Logistic Regression**, and **KNN**. These models require numerical inputs; therefore, we need to encode the `Class` column as numerical values. Since the `Class` column is binary, **label encoding (0, 1)** is sufficient for this project.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the class names into integer codes and overwrite the column with them.
encoder = LabelEncoder()
encoded_class = encoder.fit_transform(df['Class'])
df['Class'] = encoded_class

# Expect exactly two distinct codes: 0 and 1.
print(df['Class'].unique())


## B. Check and Handle Outliers

In [None]:
from scipy.stats import zscore

# Numeric feature columns only. `Class` is removed by name: after label
# encoding it is an integer column, but it is the target label rather than a
# measurement, so it should not be z-scored here or treated as a feature by
# the later cells that reuse `numerical_cols` (boxplots, min-max scaling).
# errors='ignore' makes the drop a no-op if `Class` was not selected.
numerical_cols = (
    df.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop('Class', errors='ignore')
)

# Calculate Z-scores for each numerical feature column
z_scores = df[numerical_cols].apply(zscore)

# Identify outliers (absolute Z-score > 3) and count them per column
outliers = (z_scores.abs() > 3).sum()
print("Number of outliers in each column:")
print(outliers)


Visually see the outliers.

In [None]:
import matplotlib.pyplot as plt

# One horizontal, filled boxplot per column, each on its own 8x4 figure.
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.boxplot(df[col], vert=False, patch_artist=True)
    ax.set_title(f"Boxplot for {col}")
    ax.set_xlabel(col)
    plt.show()

Keep outliers because <explanation\>.

## C. Data Preprocessing

Make a copy of the dataset for each model, as each one will be preprocessed differently:

- KNN - Normalization
- Logistic Regression - Standardization
- Decision Trees - Keep it as is

In [None]:
# Three independent copies of the cleaned frame, one per model (logistic
# regression, KNN, decision tree), so each can be preprocessed on its own
# without affecting the others or `df`.
df_logistic, df_knn, df_tree = (df.copy() for _ in range(3))

### Normalization for KNN model.

Check data before normalization.

In [None]:
# Summary statistics of the KNN copy before any scaling is applied.
knn_summary_before = df_knn.describe()
print(knn_summary_before)

Normalize the data using `MinMaxScaler`.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each selected column's minimum and maximum, then rescale those
# columns in place (MinMaxScaler's default output range is 0 to 1).
scaler_knn = MinMaxScaler()
scaler_knn.fit(df_knn[numerical_cols])
df_knn[numerical_cols] = scaler_knn.transform(df_knn[numerical_cols])

# Summary statistics after scaling
print(df_knn.describe())


Check normalized dataset.

In [None]:
# Render the KNN copy of the dataset as it stands after the scaling cell above.
display(df_knn)

### Standardization for Logistic Regression

Check Data Before Standardization.

In [None]:
# Summary statistics of the logistic-regression copy before standardization.
logistic_summary_before = df_logistic.describe()
print(logistic_summary_before)

Standardize Data Using StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate the target from the feature columns.
y_logistic = df_logistic['Class']  # Target
X_logistic = df_logistic.drop(columns='Class')  # Features

# Standardize the features and wrap the resulting array back into a
# DataFrame that carries the original column names.
scaler_logistic = StandardScaler()
X_logistic_standardized = pd.DataFrame(
    scaler_logistic.fit_transform(X_logistic),
    columns=X_logistic.columns,
)

# Rebuild the logistic-regression frame: standardized features plus the
# untouched target column.
df_logistic = X_logistic_standardized.assign(Class=y_logistic)

print(df_logistic.describe())

Check standardized dataset.

In [None]:
# Render the logistic-regression copy as it stands after the standardization cell above.
display(df_logistic)

The Decision Tree model does not require scaled data and can work directly with the original cleaned dataset. Therefore, the final datasets to be used for each model are as follows: `df_knn` for KNN, `df_logistic` for Logistic Regression, and `df_tree` for Decision Trees.

# V. Exploratory data analysis

Display dataset information.

In [None]:
# First rows of the cleaned dataset.
display(df.head())

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

# Summary statistics for the numeric columns.
print(df.describe())

Analyze the `Class` Variable

In [None]:
# Number of samples in each class.
class_counts = df['Class'].value_counts()
print(class_counts)

import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the class balance, labelled through the returned Axes.
ax = sns.countplot(data=df, x='Class')
ax.set_title("Distribution of Target Variable (Class)")
ax.set_xlabel("Class (0 = Cercevelik, 1 = Urgup Sivrisi)")
ax.set_ylabel("Count")
plt.show()


Visualize the distribution of numerical features.

In [None]:
# Histograms for every column except the target.
feature_frame = df.drop(columns='Class')
feature_frame.hist(bins=20, figsize=(20, 15), edgecolor='black')
plt.suptitle("Feature Distributions (Excluding Class)", fontsize=20)
plt.show()

Check the correlation between features.

In [None]:
# Pairwise correlation between all columns of the cleaned dataset.
corr_matrix = df.corr()

# Annotated heatmap drawn on an explicitly created 12x8 axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


Visualize the relationship between features and the `Class` variable.

In [None]:
# Boxplots for numerical features grouped by 'Class'.
# The target is excluded by name rather than by position, so the loop stays
# correct even if 'Class' is not the last column.
for col in df.columns.drop('Class'):
    plt.figure(figsize=(8, 4))
    sns.boxplot(x='Class', y=col, data=df)
    plt.title(f"{col} by Class")
    plt.xlabel("Class (0 = Cercevelik, 1 = Urgup Sivrisi)")
    plt.ylabel(col)
    plt.show()


In [None]:
# Pairwise feature relationships coloured by class: KDE curves on the
# diagonal, and only the lower triangle of panels drawn (corner=True).
sns.pairplot(
    data=df,
    hue='Class',
    diag_kind='kde',
    corner=True,
)
plt.suptitle(t="Pairplot of Features by Class", y=1.02)
plt.show()


# K-Nearest Neighbors

In [206]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [207]:
# Take the KNN features from the min-max normalized copy prepared during
# preprocessing. KNN is distance-based, so features on very different scales
# would otherwise contribute unequally to the distances.
X_knn = df_knn.drop('Class', axis=1)

# The labels are read from `df`, where they are the untouched label-encoded
# values; both frames share the same row index.
y_knn = df['Class']

Train test split

In [208]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
knn_split = train_test_split(X_knn, y_knn, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = knn_split

Feature selection

In [209]:
# Fit a random forest on the training split to score each feature.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Select the features whose importance exceeds the cut-off.
threshold = 0.05
importance = rf.feature_importances_
important_features = X_train.columns[importance > threshold]

# Print every feature, from most to least important.
indices = np.argsort(importance)[::-1]
print("Feature ranking:")
for rank, idx in enumerate(indices, start=1):
    feature_name = X_train.columns[idx]
    feature_score = importance[idx]
    print(f"{rank}. Feature: {feature_name} (Importance: {feature_score:.4f})")

Feature ranking:
1. Feature: Aspect_Ration (Importance: 0.2019)
2. Feature: Compactness (Importance: 0.1671)
3. Feature: Eccentricity (Importance: 0.1522)
4. Feature: Roundness (Importance: 0.1441)
5. Feature: Major_Axis_Length (Importance: 0.0788)
6. Feature: Solidity (Importance: 0.0466)
7. Feature: Minor_Axis_Length (Importance: 0.0425)
8. Feature: Extent (Importance: 0.0389)
9. Feature: Perimeter (Importance: 0.0373)
10. Feature: Equiv_Diameter (Importance: 0.0314)
11. Feature: Area (Importance: 0.0313)
12. Feature: Convex_Area (Importance: 0.0280)


Retain only features above the threshold

In [210]:
# Show the selected feature names, then restrict both splits to those columns.
print(important_features)
X_train, X_test = X_train[important_features], X_test[important_features]

Index(['Major_Axis_Length', 'Eccentricity', 'Roundness', 'Aspect_Ration',
       'Compactness'],
      dtype='object')


## A. Initial model training

## B. Error analysis

## C. Improving model performance

## D. Model performance summary

# Logistic Regression

## A. Initial model training

## B. Error analysis

## C. Improving model performance

## D. Model performance summary

REFERENCE: [Logistic Regression: Sklearn\Scikit](https://www.datacamp.com/tutorial/understanding-logistic-regression-python)

In [None]:
# First rows of the cleaned dataset.
display(df.head())

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

# Summary statistics for the numeric columns.
print(df.describe())

In [None]:
##import logistic regression libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler #not used for now
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Train on the standardized `df_logistic` built in the preprocessing section.
# Re-copying the raw `df` here would discard that scaling and fit the model on
# features with very different magnitudes.
X_logistic = df_logistic.drop('Class', axis=1)  # features
y_logistic = df_logistic['Class']  # target

# training set = 70%, test set = 30%; the fixed seed makes the split repeatable
X_train_logistic, X_test_logistic, y_train_logistic, y_test_logistic = train_test_split(
    X_logistic, y_logistic, test_size=0.3, random_state=1
)

# Fit with default hyperparameters and predict the held-out rows.
model_logistic = LogisticRegression()
model_logistic.fit(X_train_logistic, y_train_logistic)
y_pred_logistic = model_logistic.predict(X_test_logistic)

In [None]:
#summarize results

# Decision Trees

## A. Initial model training

## B. Error analysis

## C. Improving model performance

## D. Model performance summary

# Section 10. Insights and conclusions

# Section 11. Reference