In [1]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Dataset

In [2]:
df = pd.read_csv("apple_quality.csv")
display(df.head(2), df.sample(2), df.tail(2))

Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,0.0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.491590483,good
1,1.0,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809367,good


Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
1829,1829.0,-4.090033,1.77797,-0.556282,0.128194,-0.06822,-3.114148,0.473929765,good
3,3.0,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723217,good


Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
3999,3999.0,0.27854,-1.715505,0.121217,-1.154075,1.266677,-0.776571,1.599796456,good
4000,,,,,,,,Created_by_Nidula_Elgiriyewithana,


In [3]:
df = df[:-1].drop(columns={"A_id"})
df.head()

Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.491590483,good
1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809367,good
2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636473,bad
3,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723217,good
4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984036,good


In [4]:
# Dataset Information
display(df.describe(),
pd.DataFrame({
    'feature': df.columns.values,
    'dtypes': [df[col].dtype for col in df.columns],
    'n_unique': df.nunique().values,
    'n_nan': [df[col].isna().sum() for col in df.columns],
    'n_dupe': [df.duplicated().sum() for row in df.columns],
    'sample_unique': [df[col].unique() for col in df.columns]
    })
)
print(f'''
      Total Columns that having missing value = {df.isnull().any().sum()}
      Total Columns that are clean            = {df.shape[1] - df.isnull().any().sum()}
      Total Rows that having missing value    = {df.isnull().sum().sum()}
      Total Rows                              = {df.shape[0]}
      Totol Percentage of missing value      = {(df.isnull().sum().sum()/np.product(df.shape)) * 100}
      ''')

Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,-0.503015,-0.989547,-0.470479,0.985478,0.512118,0.498277
std,1.928059,1.602507,1.943441,1.402757,1.930286,1.874427
min,-7.151703,-7.149848,-6.894485,-6.055058,-5.961897,-5.864599
25%,-1.816765,-2.01177,-1.738425,0.062764,-0.801286,-0.771677
50%,-0.513703,-0.984736,-0.504758,0.998249,0.534219,0.503445
75%,0.805526,0.030976,0.801922,1.894234,1.835976,1.766212
max,6.406367,5.790714,6.374916,7.619852,7.364403,7.237837


Unnamed: 0,feature,dtypes,n_unique,n_nan,n_dupe,sample_unique
0,Size,float64,4000,0,0,"[-3.970048523, -1.195217191, -0.292023862, -0...."
1,Weight,float64,4000,0,0,"[-2.512336381, -2.839256528, -1.351281995, -2...."
2,Sweetness,float64,4000,0,0,"[5.346329613, 3.664058758, -1.738429162, 1.324..."
3,Crunchiness,float64,4000,0,0,"[-1.012008712, 1.588232309, -0.342615928, -0.0..."
4,Juiciness,float64,4000,0,0,"[1.844900361, 0.853285795, 2.838635512, 3.6379..."
5,Ripeness,float64,4000,0,0,"[0.329839797, 0.867530082, -0.038033328, -3.41..."
6,Acidity,object,4000,0,0,"[-0.491590483, -0.722809367, 2.621636473, 0.79..."
7,Quality,object,2,0,0,"[good, bad]"



      Total Columns that having missing value = 0
      Total Columns that are clean            = 8
      Total Rows that having missing value    = 0
      Total Rows                              = 4000
      Totol Percentage of missing value      = 0.0
      


In [5]:
# ReType
df["Acidity"] = df['Acidity'].astype(float)

In [6]:
# Outliers
numeric_columns = df.select_dtypes(include=np.number).columns.tolist()
def find_anomalies(data, column_name):
    q1, q3 = data.quantile([0.25, 0.75])
    iqr = q3 - q1
    limit = iqr * 1.5
    bot = q1 - limit
    top = q3 + limit
    outliers_count = ((data < (bot)) | (data > (top))).sum()

    result = pd.DataFrame({
        'Column': [column_name],
        'IQR': [iqr],
        'Lower Bound': [bot],
        'Upper Bound': [top],
        'Outliers': [outliers_count]
    })
    return result

df_outliers = pd.DataFrame(columns=['Column', 'IQR', 'Lower Bound', 'Upper Bound', 'Outliers'])

for column in numeric_columns:
    result = find_anomalies(df[column], column)
    df_outliers = pd.concat([df_outliers, result], ignore_index=True)
    
# Remove Outliers
for i,low,up in zip(df_outliers.Column.unique(),df_outliers["Lower Bound"].unique(),df_outliers["Upper Bound"].unique()):
    df = df[(df[i] >= low) & (df[i] <= up)]

display(df_outliers)
print(f"Numbers of Outliers Removed : {df_outliers['Outliers'].sum()}")

  df_outliers = pd.concat([df_outliers, result], ignore_index=True)


Unnamed: 0,Column,IQR,Lower Bound,Upper Bound,Outliers
0,Size,2.622291,-5.750201,4.738963,22
1,Weight,2.042747,-5.07589,3.095097,54
2,Sweetness,2.540347,-5.548946,4.612442,32
3,Crunchiness,1.83147,-2.68444,4.641439,47
4,Juiciness,2.637262,-4.757179,5.79187,32
5,Ripeness,2.537889,-4.57851,5.573044,24
6,Acidity,2.887917,-5.709299,5.842368,20


Numbers of Outliers Removed : 231


In [None]:
import matplotlib.pyplot as plt
# Good & Bad Sample Comparation
quality_counts = df['Quality'].value_counts()

In [10]:
quality_counts

Quality
bad     1928
good    1862
Name: count, dtype: int64

In [None]:
plt.pie(quality_counts, labels=quality_counts.index, autopct='%1.2f%%',  explode=(0.1, 0.1), startangle=90,
        wedgeprops={'edgecolor': 'black', 'linewidth': 1, 'antialiased': True})

legend_labels = [f'{index}: {count}' for index, count in zip(quality_counts.index, quality_counts)]
plt.legend(legend_labels, title='Quality Counts', loc='upper right')
plt.tight_layout()
plt.show()

In [8]:
# Encode
df['Quality'] = df['Quality'].map({'bad': -1, 'good': 1})
df.sample(5)

Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
1600,0.512557,0.22411,0.337204,1.799258,-0.313148,-2.156019,0.667614,1
2876,1.470532,-2.533291,-1.157054,-0.276371,0.887803,0.569622,2.343418,1
2760,0.417092,-0.311706,-0.452094,-0.73794,1.52029,-3.265195,3.856591,-1
272,-3.179262,-0.884673,0.775792,-0.109996,3.81563,0.421346,2.15791,-1
87,0.523551,-0.785293,0.307177,-0.773882,1.220446,-1.329702,1.355485,1


## Train dataset

In [9]:
# Train Test Split
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
X = df.drop("Quality", axis=1)
y = df["Quality"]

scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
print(f"""
X_train = {X_train.shape}
X_test = {X_test.shape}""")

ModuleNotFoundError: No module named 'imblearn'

In [None]:
X_train.to_csv("apple_X_train.csv", index=False)
X_test.to_csv("apple_X_test.csv", index=False)
y_train.to_csv("apple_y_train.csv", index=False)
y_test.to_csv("apple_y_test.csv", index=False)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
w_star = np.insert(lr.coef_, 0, np.array(lr.intercept_))
print(w_star)

### Single feature model

In [None]:
# Train Test Split
from sklearn.model_selection import train_test_split
#from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
X = df.drop("Quality", axis=1)
#X = pd.DataFrame(X[["Size", "Weight"]])
y = df["Quality"]

In [None]:
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
#smote = SMOTE(random_state=42)
#X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

#X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
print(f"""
X_train = {X_train.shape}
X_test = {X_test.shape}""")

In [None]:
# Import model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
w_star = np.insert(lr.coef_, 0, np.array(lr.intercept_))
print(w_star)

In [None]:
X_train["Cost"] = np.ones(X_train.shape[0])
X_train.T

In [None]:
import matplotlib.pyplot as plt
plt.scatter(X_train["Size"].values, y_train.values)
plt.scatter(X_train["Size"].values, 1 / (1 + np.exp(- np.dot(w_star.T, X_train.values.T))))
plt.show()

In [None]:
# Generate a neighbour point
eps = 0.1
w_bar = w_star + np.array([0.5,-0.5,0,0,0,-0.08,0.5,-0.7])
print(np.linalg.norm(w_star - w_bar))
print(w_bar)

In [None]:
from sklearn.metrics import log_loss
log_loss(y_test, y_pred)