In [2]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler,MinMaxScaler
from sklearn.svm import SVC

In [3]:
# Read the wine dataset from a CSV path given relative to the current working directory.
data=pd.read_csv('mlfolder/CSV_Files/wineData.csv')

In [4]:
# Preview the first rows to check how the columns were parsed.
data.head()

Unnamed: 0,winery,wine,year,rating,num_reviews,country,region,type,body,acidity,price_range,Unnamed: 11,Unnamed: 12
0,Teso La Monja,Tinto,2013,4.9,58,Espana,Toro,Toro Red,5.0,3.0,[200-3030],1.0,0.0
1,Artadi,Vina El Pison,2018,4.9,31,Espana,Vino de Espana,Tempranillo,4.0,2.0,[200-3030],1.0,
2,Vega Sicilia,Unico,2009,4.8,1793,Espana,Ribera del Duero,Ribera Del Duero Red,5.0,3.0,[200-3030],,
3,Vega Sicilia,Unico,1999,4.8,1705,Espana,Ribera del Duero,Ribera Del Duero Red,5.0,3.0,[200-3030],,
4,Vega Sicilia,Unico,1996,4.8,1309,Espana,Ribera del Duero,Ribera Del Duero Red,5.0,3.0,[200-3030],,


In [5]:
# (rows, columns) of the loaded frame.
data.shape

(7499, 13)

In [7]:
# Count missing values per column.
# DataFrame.sum() is used instead of np.sum(DataFrame): the NumPy call forwards
# axis=None to pandas, which is deprecated and emits a FutureWarning; a future
# pandas will reduce over both axes and return one scalar instead of a Series.
data.isna().sum()

  return reduction(axis=axis, out=out, **passkwargs)


winery            0
wine              0
year              2
rating            0
num_reviews       0
country           0
region            0
type            545
body           1169
acidity        1169
price_range       0
Unnamed: 11    7497
Unnamed: 12    7498
dtype: int64

In [5]:
# Class balance of the target: number of rows in each price_range bucket.
data['price_range'].value_counts()

price_range
[10-20]       2138
[20-30]       1655
[50-100]      1559
[30-40]        762
[40-50]        611
[200-3030]     272
[0-10]         237
[100-150]      163
[150-200]      102
Name: count, dtype: int64

In [6]:
# Remove the `country` column from the working frame.
# errors='ignore' keeps the cell idempotent: because `data` is rebound here,
# re-running the cell used to raise KeyError once the column was already gone.
data=data.drop(columns='country',errors='ignore')

In [7]:
data.head()

Unnamed: 0,winery,wine,year,rating,num_reviews,region,type,body,acidity,price_range,Unnamed: 11,Unnamed: 12
0,Teso La Monja,Tinto,2013,4.9,58,Toro,Toro Red,5.0,3.0,[200-3030],1.0,0.0
1,Artadi,Vina El Pison,2018,4.9,31,Vino de Espana,Tempranillo,4.0,2.0,[200-3030],1.0,
2,Vega Sicilia,Unico,2009,4.8,1793,Ribera del Duero,Ribera Del Duero Red,5.0,3.0,[200-3030],,
3,Vega Sicilia,Unico,1999,4.8,1705,Ribera del Duero,Ribera Del Duero Red,5.0,3.0,[200-3030],,
4,Vega Sicilia,Unico,1996,4.8,1309,Ribera del Duero,Ribera Del Duero Red,5.0,3.0,[200-3030],,


In [8]:
# Seed NumPy's global random number generator so later draws from it are repeatable.
np.random.seed(10)

In [12]:
# Column groups handled by the preprocessor.
numeric_features=['rating','num_reviews']
categorical_features=['winery','wine','year','region','type','body','acidity']

# Numeric columns: fill gaps with the most frequent value, then min-max scale.
numeric_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('scaler',MinMaxScaler()),
])

# Categorical columns: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer=OneHotEncoder(handle_unknown='ignore')

# Route each column group through its transformer. Columns that appear in
# neither list are dropped (ColumnTransformer's default `remainder`).
preprocessor=ColumnTransformer(transformers=[
    ('num',numeric_transformer,numeric_features),
    ('cat',categorical_transformer,categorical_features),
])

In [13]:
# Split the frame into the feature matrix and the label column.
X=data.drop(columns='price_range')
target=data['price_range']

In [14]:
# Map every price-range string to an integer class id. The fitted encoder is
# kept in `label` so that ids can be decoded later with inverse_transform.
label=LabelEncoder().fit(target)
y=label.transform(target)

In [16]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffled split repeatable.
X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [17]:
# First model: the column preprocessor followed by liblinear logistic regression.
clf=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('classifier',LogisticRegression(solver='liblinear')),
])

In [18]:
# Fit the preprocessing steps and the classifier on the training split.
clf.fit(X_train,y_train)

In [20]:
# Report the logistic-regression pipeline's score on the held-out split.
print(
    'Logistic Regression model score: %.3f'
    % clf.score(X_test,y_test)
)

Logistic Regression model score: 0.859


In [21]:
# Second model: RBF-kernel SVC behind the same preprocessing recipe.
# The preprocessor is cloned so this pipeline owns an independent, unfitted copy.
# Pipeline fits its steps in place, so passing the very same object that `clf`
# holds would make both pipelines share one transformer, and fitting one would
# overwrite the fitted state used by the other.
clf2=Pipeline(steps=[
    ('preprocessor',clone(preprocessor)),
    ('classifier',SVC(C=1000,kernel='rbf')),
])

In [22]:
# Fit the SVC pipeline on the same training split used for the first model.
clf2.fit(X_train,y_train)

In [23]:
# Report the SVC pipeline's score on the held-out split.
print(
    "SVC Model Score: %.3f"
    % clf2.score(X_test,y_test)
)

SVC Model Score: 0.876


In [24]:
# Decode a single integer class id back to its price-range string.
label.inverse_transform([7])

array(['[40-50]'], dtype=object)

In [26]:
# Print the complete class id -> price-range mapping learned by the encoder,
# which is needed to read the per-class rows of the reports.
for class_id in np.unique(y):
    print(class_id,label.inverse_transform([class_id]))

0 ['[0-10]']
1 ['[10-20]']
2 ['[100-150]']
3 ['[150-200]']
4 ['[20-30]']
5 ['[200-3030]']
6 ['[30-40]']
7 ['[40-50]']
8 ['[50-100]']


In [27]:
# Per-class precision / recall / F1 for the SVC pipeline on the held-out split.
# Row labels are the integer ids produced by the label encoder.
print(
    "SVC",
    metrics.classification_report(y_true=y_test,y_pred=clf2.predict(X_test)),
)

SVC               precision    recall  f1-score   support

           0       1.00      0.94      0.97        49
           1       0.98      0.98      0.98       421
           2       0.46      0.38      0.41        32
           3       0.07      0.05      0.06        19
           4       0.91      0.90      0.90       344
           5       0.80      0.80      0.80        66
           6       0.77      0.77      0.77       154
           7       0.82      0.81      0.82       110
           8       0.84      0.90      0.87       305

    accuracy                           0.88      1500
   macro avg       0.74      0.72      0.73      1500
weighted avg       0.87      0.88      0.87      1500



In [29]:
# Per-class precision / recall / F1 for the logistic-regression pipeline on the
# held-out split. Row labels are the integer ids produced by the label encoder.
print(
    'LR',
    metrics.classification_report(y_true=y_test,y_pred=clf.predict(X_test)),
)

LR               precision    recall  f1-score   support

           0       1.00      0.94      0.97        49
           1       0.94      0.97      0.95       421
           2       0.47      0.25      0.33        32
           3       0.14      0.05      0.08        19
           4       0.92      0.88      0.90       344
           5       0.82      0.76      0.79        66
           6       0.76      0.75      0.75       154
           7       0.87      0.76      0.81       110
           8       0.77      0.90      0.83       305

    accuracy                           0.86      1500
   macro avg       0.74      0.70      0.71      1500
weighted avg       0.85      0.86      0.85      1500



In [None]:
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset. The path matches the load cell that was actually executed
# earlier in this notebook; the bare "wineData.csv" previously used here did not.
data = pd.read_csv("mlfolder/CSV_Files/wineData.csv")

# Discard the stray "Unnamed: N" columns. They are almost entirely empty, and
# because every non-numeric column is treated as categorical below, they would
# otherwise be imputed and one-hot encoded as if they were real features.
data = data.loc[:, ~data.columns.str.startswith("Unnamed")]

# Step a: Delete rows with missing target values
data = data.dropna(subset=['price_range'])

# Separate features and target variable
X = data.drop('price_range', axis=1)
y = data['price_range']

# Identify numeric and categorical columns
numeric_features = ['rating', 'num_reviews']
categorical_features = [col for col in X.columns if col not in numeric_features]

# Step b: Handle missing values
# Numeric columns: median imputation, then standardization.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]
)

# Create the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Step c: Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Step e: Create train and test data sets with 80-20 split.
# The price ranges are heavily imbalanced, so the split is stratified to keep
# every class represented in both parts in the same proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Create pipelines for models.
# The second pipeline gets its own clone of the preprocessor: Pipeline fits its
# steps in place, so sharing one instance would let fitting one model overwrite
# the transformer state the other model relies on.
logistic_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000))
    ]
)

svc_pipeline = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", SVC())
    ]
)

# Train Logistic Regression model
logistic_pipeline.fit(X_train, y_train)
logistic_pred = logistic_pipeline.predict(X_test)

# Train Support Vector Classifier model
svc_pipeline.fit(X_train, y_train)
svc_pred = svc_pipeline.predict(X_test)

# Step f: Evaluate the models
logistic_accuracy = accuracy_score(y_test, logistic_pred)
svc_accuracy = accuracy_score(y_test, svc_pred)

print("Logistic Regression Model Accuracy:", logistic_accuracy)
print("Support Vector Classifier Model Accuracy:", svc_accuracy)

# Show the original price-range strings in the reports instead of integer ids.
# `labels` is passed explicitly so the names stay aligned with their ids even
# if a class happens to be absent from the test predictions.
class_ids = list(range(len(label_encoder.classes_)))
class_names = [str(name) for name in label_encoder.classes_]

for model_title, predictions in (
    ("Logistic Regression", logistic_pred),
    ("Support Vector Classifier", svc_pred),
):
    print(f"\n{model_title} Classification Report:")
    print(
        classification_report(
            y_test, predictions, labels=class_ids, target_names=class_names
        )
    )
