In [None]:
# What I did:

'''
- NA
- dupes
- SMOTE
- log reg
- feature importance
- collapse quality scores that have minimal samples in training sets (5 or less)
'''

In [1]:
# IMPORTS
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

In [2]:
# BASE MODEL SCORE: LOG REGRESSION
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

RANDOM_STATE = 42

raw_df = pd.read_csv('/Users/ethanmc/git-repos/kaggle/challenge-09162024/kaggle/input/winequality-red.csv')

X = raw_df.drop('quality', axis=1)
y = raw_df['quality']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = RANDOM_STATE)

regr_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=RANDOM_STATE)

regr_model.fit(X_train, y_train)

y_pred = regr_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.56

Classification Report:
               precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00        19
           5       0.63      0.75      0.68       217
           6       0.51      0.59      0.55       213
           7       0.40      0.14      0.21        70
           8       0.00      0.00      0.00         7

    accuracy                           0.56       528
   macro avg       0.26      0.25      0.24       528
weighted avg       0.52      0.56      0.53       528



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [3]:
# Data improvements and feature selection
RANDOM_STATE = 42
raw_df = pd.read_csv('/Users/ethanmc/git-repos/kaggle/challenge-09162024/kaggle/input/winequality-red.csv')
raw_df = raw_df.dropna()
raw_df = raw_df.drop_duplicates()

X = raw_df.drop('quality', axis=1)
y = raw_df['quality']
y = y.replace({3: 3, 4: 3})
y = y.replace({7: 8, 8: 8})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

# Multi-class smote tutorial: https://machinelearningmastery.com/multi-class-imbalanced-classification/

strategy = {3:100, 8:200}

oversample = SMOTE(sampling_strategy=strategy)
X_train, y_train = oversample.fit_resample(X_train, y_train)

regr_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)

regr_model.fit(X_train, y_train)

# Get feature importance scores
coeffs = regr_model.coef_[0]

importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': coeffs 
})
importance_df['abs_importance'] = importance_df['importance'].abs()
importance_df = importance_df.sort_values(by='abs_importance', ascending=False)

# removing all the collinear features (free sulfur + total sulfur, fixed acidity + density)
important_features_df = importance_df[importance_df['abs_importance'] > 0.08]

important_features = important_features_df['feature']

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [4]:
# Use our selected features and retrain the model
accuracies = []
for i in range(0,50):
    raw_df = pd.read_csv('/Users/ethanmc/git-repos/kaggle/challenge-09162024/kaggle/input/winequality-red.csv')
    raw_df = raw_df.dropna()
    raw_df = raw_df.drop_duplicates()

    X = raw_df.drop('quality', axis=1)
    y = raw_df['quality']
    y = y.replace({3: 3, 4: 3})
    y = y.replace({7: 8, 8: 8})

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

    X_train = X_train[important_features]
    X_test = X_test[important_features]

    # Multi-class smote tutorial: https://machinelearningmastery.com/multi-class-imbalanced-classification/

    strategy = {3:100, 8:200}

    oversample = SMOTE(sampling_strategy=strategy)
    X_train, y_train = oversample.fit_resample(X_train, y_train)

    regr_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)

    regr_model.fit(X_train, y_train)

    y_pred = regr_model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    #print(f"Accuracy: {accuracy:.2f}")
    #print("\nClassification Report:\n", classification_report(y_test, y_pred))
    accuracies.append(accuracy)

accuracy_df = pd.DataFrame(accuracies, columns=['accuracy'])
accuracy_df.describe()



Unnamed: 0,accuracy
count,50.0
mean,0.59265
std,0.004681
min,0.579065
25%,0.5902
50%,0.592428
75%,0.596325
max,0.601336


In [5]:
''' 
So maybe the fact that I'm getting a better score even including low importance features is because of overfitting?

One interesting thing is my precision scores across classes is much better.
'''

" \nSo maybe the fact that I'm getting a better score even including low importance features is because of overfitting?\n\nOne interesting thing is my precision scores across classes is much better.\n"