In [20]:
''
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, recall_score, precision_score, fbeta_score, make_scorer, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, ShuffleSplit, learning_curve, validation_curve
from sklearn.feature_selection import RFE
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

import seaborn as sns
import pandas as pd
import numpy as np


data = pd.read_excel('dress_sales.xlsx')
data.drop(['Dress_ID'], axis=1, inplace=True)

df = pd.get_dummies(data)
X = df.drop(['Recommendation'], axis=1)
y = df['Recommendation']



In [21]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 0, test_size=0.2)

In [22]:
# Cross validation
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

C = np.arange(1, 10)

# # LEARNING CURVE SCORE
# # Create three different models based on max_depth
for k, C in enumerate(C):
    # Create a Decision tree regressor at max_depth = depth
    regressor = LogisticRegression(C=C)

    # Calculate the training and testing scores
    sizes, train_scores, test_scores = learning_curve(
        regressor, X, y, cv=cv, n_jobs=4,
        scoring=make_scorer(accuracy_score)
    )

    print('C:', C)
    print('score train:', np.mean(train_scores))
    print('score test:', np.mean(test_scores))
    

C: 1
score train: 0.846498827875
score test: 0.580407858981
C: 2
score train: 0.864897118494
score test: 0.572811004497
C: 3
score train: 0.874291301086
score test: 0.573621431835
C: 4
score train: 0.881127906404
score test: 0.572428636702
C: 5
score train: 0.883295577952
score test: 0.570824615829
C: 6
score train: 0.885864756611
score test: 0.574037467234
C: 7
score train: 0.889309617153
score test: 0.572039054421
C: 8
score train: 0.890791312344
score test: 0.57204145925
C: 9
score train: 0.892937994546
score test: 0.574443883318


In [16]:
C = np.arange(1, 10)
regressor = LogisticRegression()
train_scores, test_scores = validation_curve(
    regressor, X, y, cv=cv, param_name='C', param_range=C,
    scoring=make_scorer(accuracy_score)
)

print('\n')
# For each depth
for x, k in enumerate(train_scores):
    print('C', x + 1)
    print('score train:', np.mean(train_scores[x]))
    print('score test:', np.mean(test_scores[x]))



C 1
score train: 0.768016519514
score test: 0.602072962509
C 2
score train: 0.788998579418
score test: 0.590072866316
C 3
score train: 0.795999592407
score test: 0.592104946733
C 4
score train: 0.800992609376
score test: 0.598092970685
C 5
score train: 0.801987616359
score test: 0.594100954717
C 6
score train: 0.806986627346
score test: 0.586104898636
C 7
score train: 0.808994623366
score test: 0.580116874684
C 8
score train: 0.813993634353
score test: 0.582124906813
C 9
score train: 0.815995636355
score test: 0.586116922781


In [18]:
estimator = LogisticRegression()
grid = GridSearchCV(
    estimator, param_grid={'C': [1, 2, 4, 3, 5], 'penalty': ['l1', 'l2']},
    scoring='accuracy', cv=cv
)

grid.fit(X_train, y_train)
best_estimator = grid.best_estimator_
y_pred = best_estimator.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
confusion_matrix(y_test, y_pred)

df.head()

Accuracy: 0.53


Unnamed: 0,Rating,Recommendation,Style_Brief,Style_Casual,Style_Flare,Style_Novelty,Style_OL,Style_Sexy,Style_bohemian,Style_cute,...,Pattern Type_leapord,Pattern Type_leopard,Pattern Type_none,Pattern Type_null,Pattern Type_patchwork,Pattern Type_plaid,Pattern Type_print,Pattern Type_solid,Pattern Type_splice,Pattern Type_striped
0,4.6,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,4.6,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,4.5,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
