In [99]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler, KBinsDiscretizer, Normalizer, MinMaxScaler, OneHotEncoder, \
    OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [84]:
df = pd.read_csv('MB/trainMobile.csv')
df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,...,336,670,869,18,10,19,1,1,1,0


# 1
Si vuole predire il valore di price_range sulla base degli attributi presenti nel dataset.
Dividere il dataset in modo che 3/4 degli elementi siano contenuti in un nuovo dataset “train” e
1/4 nel dataset “test”.

Allenare il train con il modello Decision Tree e valutare l’accuracy ottenuta calcolata sia sul
dataset train sia sul dataset test. Confrontare i risultati ottenuti con quelli ottenuti con una
predizione basata sul modello Logistic Regression. Effettuare alcune considerazioni sui risultati
ottenuti, tenendo in considerazione anche l’analisi della confusion matrix. (punti 4)

In [85]:
X = df.drop('price_range', axis=1)
y = df['price_range']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_train)
acc_dt_train = accuracy_score(y_pred, y_train)
y_pred = dt.predict(X_test)
acc_dt_test = accuracy_score(y_pred, y_test)

print('---DECISION TREE---')
print(f'Accuracy train: {acc_dt_train}, Accuracy test: {acc_dt_test}')
print(confusion_matrix(y_test, y_pred))

lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)
acc_lr_train = accuracy_score(y_pred, y_train)
y_pred = lr.predict(X_test)
acc_lr_test = accuracy_score(y_pred, y_test)

print('---LOGISTIC REGRESSION---')
print(f'Accuracy train: {acc_lr_train}, Accuracy test: {acc_lr_test}')
print(confusion_matrix(y_test, y_pred))

---DECISION TREE---
Accuracy train: 1.0, Accuracy test: 0.812
[[110  22   0   0]
 [ 11  96  11   0]
 [  0  16  85  19]
 [  0   0  15 115]]
---LOGISTIC REGRESSION---
Accuracy train: 0.6453333333333333, Accuracy test: 0.616
[[95 36  1  0]
 [24 59 26  9]
 [ 1 23 59 37]
 [ 0  3 32 95]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Il dt ha overfitting, ma ha comunque prestazioni migliori del lr, quindi ci sono delle feature che non sono lineari.
Dalle confusion matrix posso vedere che il dt sbaglia su classi adiacenti, mentre il lr è più sporca

# 2
Confrontare l’accuratezza ottenuta nel punto precedente con l’accuratezza si ottiene con un
una 10 Fold cross validation. (punti 1)

In [86]:
cross = cross_val_score(dt, X, y, cv=10, scoring='accuracy', n_jobs=-1)
print(cross)
print(cross.mean())
print(cross.std())

[0.845 0.86  0.85  0.825 0.845 0.83  0.83  0.815 0.87  0.79 ]
0.836
0.022


Non è cambiato molto (0.81 -> 0.83), quindi 3/4 è una buona approssimazione

# 3
Utilizzare la funzione di gridSearchCV per trovare i parametri migliori del classificatore
decision tree. Agire sui parametri criterion, max_features e min_samples_split. Vericare se
l’accuratezza che si ottiene con la nuova configurazione supera quella standard ottenuta al
punto 1 (punti 4)

In [87]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': [2, 5, 10, 20],
}
gs = GridSearchCV(dt, param_grid=param_grid, cv=5, n_jobs=-1)
gs.fit(X_train, y_train)
y_pred = gs.predict(X_train)
acc_gs_train = accuracy_score(y_pred, y_train)
y_pred = gs.predict(X_test)
acc_gs_test =accuracy_score(y_pred, y_test)
print('---Grid Search---')
print('Accuracy train: {:.2f}%'.format(acc_gs_train*100))
print('Accuracy test: {:.2f}%'.format(acc_gs_test*100))
print(f'Best params: {gs.best_params_}')

if acc_gs_test > acc_dt_test:
    print(f"MIGLIORAMENTO: La nuova configurazione supera quella standard di {(acc_gs_test - acc_dt_test):.4f}")
else:
    print(f"NESSUN MIGLIORAMENTO: La configurazione standard era uguale o migliore (Diff: {(acc_gs_test - acc_dt_test):.4f})")

print("\nConfusion Matrix (Best Model):")
print(confusion_matrix(y_test, y_pred))

---Grid Search---
Accuracy train: 100.00%
Accuracy test: 84.00%
Best params: {'criterion': 'entropy', 'max_features': None, 'min_samples_split': 2}
MIGLIORAMENTO: La nuova configurazione supera quella standard di 0.0280

Confusion Matrix (Best Model):
[[119  13   0   0]
 [ 12  97   9   0]
 [  0  13  89  18]
 [  0   0  15 115]]


# 4
Utilizzare la funzione MaxAbsScaler per scalare i valori del dataset tra 0 e 1 e confrontare se
l’accuratezza ottenuta con il Decision Tree Classifier migliora (punti 3).

In [88]:
mas = MaxAbsScaler()
df_scaled = mas.fit_transform(df)

X_scaled = df.drop('price_range', axis=1)
y_scaled = df['price_range']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

dt.fit(X_train, y_train)
y_pred = dt.predict(X_train)
acc_dt_train_scaled = accuracy_score(y_pred, y_train)
y_pred = dt.predict(X_test)
acc_dt_test_scaled = accuracy_score(y_pred, y_test)
print(f'SCALED : Train: {acc_dt_train_scaled}, Accuracy test: {acc_dt_test_scaled}')
print(f'BEFORE: Accuracy train: {acc_dt_train}, Accuracy test: {acc_dt_test}')


SCALED : Train: 1.0, Accuracy test: 0.812
BEFORE: Accuracy train: 1.0, Accuracy test: 0.812


L'acc non migliora perché il dt non cambia con lo scaling

# 5
Discretizzare il valore di ram in 4 intervalli e verificare se l’accuratezza ottenuta con il
Decision Tree Classifier migliora (punti 2).

In [96]:
df['ram_group'] = pd.cut(df['ram'], bins= 4, labels=['0','1','2','3'])

X = df.drop(['price_range', 'ram'], axis=1)
y = df['price_range']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_train)
acc_dt_train_disc = accuracy_score(y_pred, y_train)
y_pred = dt.predict(X_test)
acc_dt_test_disc = accuracy_score(y_pred, y_test)

print('---DECISION TREE---')
print(f'DISC: Accuracy train: {acc_dt_train_disc}, Accuracy test: {acc_dt_test_disc}')
print(f'Accuracy train: {acc_dt_train}, Accuracy test: {acc_dt_test}')
print(confusion_matrix(y_test, y_pred))

---DECISION TREE---
DISC: Accuracy train: 1.0, Accuracy test: 0.762
Accuracy train: 1.0, Accuracy test: 0.812
[[109  23   0   0]
 [ 21  74  23   0]
 [  1  14  86  19]
 [  0   0  18 112]]


L'accuratezza non migliora perché il dt fa la sua discretizzazione da solo, se la facciamo prima noi stiamo solo dando meno informazioni

In [97]:
X

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,ram_group
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,9,7,19,0,0,1,2
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,14,1222,1890,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,3,915,1965,11,10,16,1,1,1,1
1997,1911,0,0.9,1,1,1,36,0.7,108,8,3,868,1632,9,1,5,1,1,0,2
1998,1512,0,0.9,0,4,1,46,0.1,145,5,5,336,670,18,10,19,1,1,1,0


# 6
Creare una pipeline in cui il valore di ram sia discretizzato in 4 intervalli, il valore di
battery_power sia discretizzato in 10 intervalli e poi il dataset venga ricondotto a valori
nell’intervallo (0,1) e normalizzato con la funzione Normalizer. Si applichi poi un modello
DecisionTree. (punti 4) [Alternativa (punti 2): non applicare la discretizzazione]

In [115]:
df = pd.read_csv('MB/trainMobile.csv')
X = df.drop('price_range', axis=1)
y = df['price_range']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

preprocessing = ColumnTransformer(
    transformers=[
        ('disc_ram', KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform'), ['ram']),
        ('disc_battery', KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform'), ['battery_power']),
], remainder='passthrough')
model = DecisionTreeClassifier(random_state=42)

pipeline = Pipeline(
    [
        ('preprocessing', preprocessing),
        ('scaler', MinMaxScaler()),
        ('norm', Normalizer()),
        ('model', model)
    ]
)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_train)
acc_pipe_train = accuracy_score(y_pred, y_train)
y_pred = pipeline.predict(X_test)
acc_pipe_test = accuracy_score(y_pred, y_test)
print('---PIPELINE---')
print('Accuracy train: {:.2f}%'.format(acc_pipe_train*100))
print('Accuracy test: {:.2f}%'.format(acc_pipe_test*100))

---PIPELINE---
Accuracy train: 100.00%
Accuracy test: 74.00%


In [116]:
y

0       1
1       2
2       2
3       2
4       1
       ..
1995    0
1996    2
1997    3
1998    0
1999    3
Name: price_range, Length: 2000, dtype: int64

# 7
Si verifichi l’accuratezza ottenuta con il file test.csv. Controllare le colonne del file. I risultati
corretti sono nel file class.csv. (punti 2).

In [117]:
X = pd.read_csv('MB/testMobile.csv')
classes = pd.read_csv('MB/class.csv')
y = classes['class']

y_pred = pipeline.predict(X)
acc = accuracy_score(y, y_pred)
print('---PIPELINE---')
print('Accuracy train: {:.2f}%'.format(acc*100))
print('Accuracy test: {:.2f}%'.format(acc*100))

---PIPELINE---
Accuracy train: 75.50%
Accuracy test: 75.50%
