In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import pandas as pd
import os
import cv2

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [2]:
CW_DATASET_PATH = "CW_dataset"

def load_data(dataset_path=CW_DATASET_PATH):
    x_train_all_path = os.path.join(dataset_path, "x_train_all.csv")
    y_train_all_path = os.path.join(dataset_path, "y_train_all.csv")
    x_test_all_path = os.path.join(dataset_path, "x_test_all.csv")
    y_test_all_path = os.path.join(dataset_path, "y_test_all.csv")

    x_train_all = pd.read_csv(x_train_all_path)
    y_train_all = pd.read_csv(y_train_all_path)
    x_test_all = pd.read_csv(x_test_all_path)
    y_test_all = pd.read_csv(y_test_all_path)

    return x_train_all, y_train_all,x_test_all,y_test_all

In [3]:
x_train, y_train,x_test, y_test = load_data()

In [4]:
x_train.shape

(9690, 2304)

In [5]:
y_train.shape

(9690, 1)

In [6]:
x_test.shape

(3090, 2304)

In [7]:
y_train = y_train.values.ravel()

# Linear Classifier


In [19]:
from sklearn.preprocessing import StandardScaler
# using solver sag, fast convergence is guaranteed with features with approximately the same scale

scaler = StandardScaler()

x_scaled = scaler.fit_transform(x_train)

x_test_scaled = scaler.fit_transform(x_test)



In [20]:
from sklearn.linear_model import LogisticRegression
# for multi-class problems: lbfgs, sag, saga, newton-cg handle multinomial loss
log_reg = LogisticRegression(solver='saga', max_iter=5000, random_state=42)
log_reg.fit(x_scaled, y_train)


### Cross Validation 

In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
cv_score = cross_val_score(log_reg, x_scaled, y_train, cv=10, scoring='accuracy')

print("Cross Validation scores:", cv_score)
print(f'Mean accuracy: {cv_score.mean()}')
print(f'Standard Deviation: {cv_score.std()}')

Cross Validation scores: [0.78018576 0.89680083 0.89164087 0.90092879 0.8875129  0.8998968
 0.85242518 0.88957688 0.89576883 0.92053664]
Mean accuracy: 0.8815273477812179
Standard Deviation: 0.03740539457797173


In [11]:
print(f'Mean accuracy: {cv_score.mean()}')
print(f'Standard Deviation: {cv_score.std()}')

Mean accuracy: 0.8815273477812179
Standard Deviation: 0.03740539457797173


### Accuracy on train dataset

In [21]:
from sklearn.metrics import accuracy_score

y_pred = log_reg.predict(x_scaled)
accuracy = accuracy_score(y_train, y_pred)
print("accuracy without cv:", accuracy)

accuracy without cv: 0.9964912280701754


### Accuracy on test dataset

In [22]:
y_test_pred = log_reg.predict(x_test_scaled)

accuracy_test = accuracy_score(y_test, y_test_pred)
print(f"accuracy on test set:", {accuracy_test})

accuracy on test set: {0.8792880258899677}


# Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)


In [13]:
from sklearn.metrics import mean_squared_error


In [14]:
liny_train_pred = lin_reg.predict(x_train)
mse_train = mean_squared_error(y_train, liny_train_pred)
print(f"Mean Squared Error on train set:", {mse_train})

Mean on train set: {0.8481028207054563}


In [17]:
liny_test_pred = lin_reg.predict(x_test)

mse_test = mean_squared_error(y_test, liny_test_pred)
print(f"Mean Squared Error on test set:", {mse_test})

Mean Squared Error on test set: {3.155529684535813}
