## Этап 1: Просмотр данных и предобработка 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from joblib import dump

In [2]:
df = pd.read_csv('/datasets/users_behavior.csv')
df.head()

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40.0,311.9,83.0,19915.42,0
1,85.0,516.75,56.0,22696.96,0
2,77.0,467.66,86.0,21060.45,0
3,106.0,745.53,81.0,8437.39,1
4,66.0,418.74,1.0,14502.75,0


In [3]:
df_train, df_valid = train_test_split(df, test_size=0.25, random_state=12345)
df_control, df_valid = train_test_split(df_valid, test_size=0.5, random_state=12345)

In [4]:
df_features = df.drop('is_ultra', axis = 1)
df_target = df['is_ultra']

df_train_features = df_train.drop('is_ultra', axis = 1)
df_train_target = df_train['is_ultra']

df_valid_features = df_valid.drop('is_ultra', axis = 1)
df_valid_target = df_valid['is_ultra']

df_control_features = df_control.drop('is_ultra', axis = 1)
df_control_target = df_control['is_ultra']

### Выводы по загруженным данным и подготовке выборок. 
Исходный фрейм был разделён через функцию train_test_split: обучающей выборке отведено 75% (чем больше, тем лучше!).    
Оставшую часть разделил по 50%, посколь задание требовало сформировать помимо валидационной выборки ещё одну - контрольную.  
Все выборки были разделе на features (без целевого столбца категории is_ultra) и target (только is_ultra).  

## Этап 2: Проверка качества моделей  


#### №1: Дерево решений

In [5]:
from sklearn.tree import DecisionTreeClassifier
d_tree_clss__model = DecisionTreeClassifier(random_state=12345, max_depth=7)
d_tree_clss__model.fit(df_train_features, df_train_target)



DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=12345, splitter='best')

In [6]:
#пробую на новых данных + accuracy
from sklearn.metrics import accuracy_score
dec_tree_answer = d_tree_clss__model.predict(df_valid_features)
dec_tree_accuracy = accuracy_score(df_valid_target, dec_tree_answer)



In [7]:
for i in range(1,15,1):
    d_tree_clss__model = DecisionTreeClassifier(random_state=12345, max_depth=i)
    d_tree_clss__model.fit(df_train_features, df_train_target)
    dec_tree_answer = d_tree_clss__model.predict(df_valid_features)
    dec_tree_accuracy = accuracy_score(df_valid_target, dec_tree_answer)
    print('If max_depth =',i,'Accuracy:', dec_tree_accuracy)

If max_depth = 1 Accuracy: 0.736318407960199
If max_depth = 2 Accuracy: 0.7736318407960199
If max_depth = 3 Accuracy: 0.7786069651741293
If max_depth = 4 Accuracy: 0.763681592039801
If max_depth = 5 Accuracy: 0.763681592039801
If max_depth = 6 Accuracy: 0.753731343283582
If max_depth = 7 Accuracy: 0.7910447761194029
If max_depth = 8 Accuracy: 0.7935323383084577
If max_depth = 9 Accuracy: 0.7985074626865671
If max_depth = 10 Accuracy: 0.7935323383084577
If max_depth = 11 Accuracy: 0.7960199004975125
If max_depth = 12 Accuracy: 0.7835820895522388
If max_depth = 13 Accuracy: 0.7786069651741293
If max_depth = 14 Accuracy: 0.763681592039801


 ### Выводы по модели дерево решений:  
 Параметр Accuracy удалось довести до 0.7985074626865671 при гиперпараметре max_depth равном 7.  
 Условие задачи (Accuracy >0.75) выполняет при max_depth равном 2.

#### №2: Случайный лес

In [8]:
from sklearn.ensemble import RandomForestClassifier

random_forest_model = RandomForestClassifier(random_state=12345, n_estimators=5, max_depth = 6)
random_forest_model.fit(df_train_features, df_train_target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=6, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=5,
                       n_jobs=None, oob_score=False, random_state=12345,
                       verbose=0, warm_start=False)

In [9]:
random_forest_answer = random_forest_model.predict(df_valid_features)
random_forest_accuracy = accuracy_score(df_valid_target, random_forest_answer)
random_forest_accuracy

0.8034825870646766

In [10]:
for e in range(1,30):
    random_forest_model = RandomForestClassifier(random_state=12345, n_estimators= e)
    random_forest_model.fit(df_train_features, df_train_target)
    random_forest_answer = random_forest_model.predict(df_valid_features)
    random_forest_accuracy = accuracy_score(df_valid_target, random_forest_answer)
    print('If n_estimators =',e,'Accuracy:', random_forest_accuracy)

If n_estimators = 1 Accuracy: 0.753731343283582
If n_estimators = 2 Accuracy: 0.7686567164179104
If n_estimators = 3 Accuracy: 0.7562189054726368
If n_estimators = 4 Accuracy: 0.7711442786069652
If n_estimators = 5 Accuracy: 0.7686567164179104
If n_estimators = 6 Accuracy: 0.7786069651741293
If n_estimators = 7 Accuracy: 0.7686567164179104
If n_estimators = 8 Accuracy: 0.7761194029850746
If n_estimators = 9 Accuracy: 0.7736318407960199
If n_estimators = 10 Accuracy: 0.7835820895522388
If n_estimators = 11 Accuracy: 0.7810945273631841
If n_estimators = 12 Accuracy: 0.7885572139303483
If n_estimators = 13 Accuracy: 0.7736318407960199
If n_estimators = 14 Accuracy: 0.7810945273631841
If n_estimators = 15 Accuracy: 0.7835820895522388
If n_estimators = 16 Accuracy: 0.7835820895522388
If n_estimators = 17 Accuracy: 0.7736318407960199
If n_estimators = 18 Accuracy: 0.7810945273631841
If n_estimators = 19 Accuracy: 0.7786069651741293
If n_estimators = 20 Accuracy: 0.7860696517412935
If n_estim

In [11]:
for e in range(1,10):
    for i in range(1,10):
        random_forest_model = RandomForestClassifier(random_state=12345, n_estimators= e, max_depth= i)
        random_forest_model.fit(df_train_features, df_train_target)
        random_forest_answer = random_forest_model.predict(df_valid_features)
        random_forest_accuracy = accuracy_score(df_valid_target, random_forest_answer)
        if random_forest_accuracy > 0.80:
            print('If n_estimators =',e,'max_depth =',i,'Accuracy:', random_forest_accuracy)

If n_estimators = 5 max_depth = 6 Accuracy: 0.8034825870646766
If n_estimators = 5 max_depth = 7 Accuracy: 0.8009950248756219
If n_estimators = 6 max_depth = 6 Accuracy: 0.8034825870646766
If n_estimators = 6 max_depth = 7 Accuracy: 0.8009950248756219
If n_estimators = 7 max_depth = 6 Accuracy: 0.8034825870646766
If n_estimators = 7 max_depth = 7 Accuracy: 0.8109452736318408
If n_estimators = 7 max_depth = 9 Accuracy: 0.8009950248756219
If n_estimators = 8 max_depth = 6 Accuracy: 0.8059701492537313
If n_estimators = 8 max_depth = 7 Accuracy: 0.8059701492537313
If n_estimators = 8 max_depth = 9 Accuracy: 0.8009950248756219
If n_estimators = 9 max_depth = 6 Accuracy: 0.8084577114427861
If n_estimators = 9 max_depth = 7 Accuracy: 0.8084577114427861
If n_estimators = 9 max_depth = 9 Accuracy: 0.8009950248756219


### Выводы по модели случайный лес:  
 Параметр Accuracy удалось довести до 0.8034825870646766 при гиперпараметрах n_estimators = 5, max_depth = 6.    
 Условие задачи (Accuracy >0.75) выполняет при n_estimators = 1, max_depth = 1.

#### №3: Логичстическая регрессия

In [12]:
#логистиеская регрессия
from sklearn.linear_model import LogisticRegression
log_reg_model = LogisticRegression(random_state=12345, solver = 'lbfgs')
log_reg_model.fit(df_train_features, df_train_target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=12345, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
log_reg_model_answer = log_reg_model.predict(df_valid_features)
log_reg_model_accuracy = log_reg_model.score(df_valid_features, df_valid_target)
log_reg_model_accuracy

0.7039800995024875

In [14]:
C_param_range = [0.0001, 0.001,0.01,0.1,1,10,100]

for z in C_param_range:
    log_reg_model = LogisticRegression(random_state=12345, solver = 'lbfgs', C = z)
    log_reg_model.fit(df_train_features, df_train_target)
    log_reg_model_answer = log_reg_model.predict(df_valid_features)
    log_reg_model_accuracy = log_reg_model.score(df_valid_features, df_valid_target)
    print('If C =',z,'Accuracy:', log_reg_model_accuracy)

If C = 0.0001 Accuracy: 0.7437810945273632
If C = 0.001 Accuracy: 0.7039800995024875
If C = 0.01 Accuracy: 0.7039800995024875
If C = 0.1 Accuracy: 0.7039800995024875
If C = 1 Accuracy: 0.7039800995024875
If C = 10 Accuracy: 0.7039800995024875
If C = 100 Accuracy: 0.7039800995024875


In [15]:
solvers = ['newton-cg', 'sag', 'saga', 'lbfgs'] 
penalties = ['l1', 'l2', 'elasticnet', 'none']
for p in penalties:
    try:
        log_reg_model = LogisticRegression(penalty = p, solver = 'lbfgs', random_state=12345)
        log_reg_model.fit(df_train_features, df_train_target)
        log_reg_model_answer = log_reg_model.predict(df_valid_features)
        log_reg_model_accuracy = log_reg_model.score(df_valid_features, df_valid_target)
        print('If penalty =', p,'Accuracy:', log_reg_model_accuracy)
    except:
        print('error')

error
If penalty = l2 Accuracy: 0.7039800995024875
error
If penalty = none Accuracy: 0.7039800995024875


 ### Выводы по модели логистической регрессии:  
 Параметр Accuracy составил 0.7039800995024875.  
 Изменение параметро solver, penaly не привело к улучшению результат.  
 Некоторые из этих параметров не совместимы друг с другом.
 Условие задачи (Accuracy >0.75) не выполняется.
  

### Проверка качества моделей на тестовой выборке.  


In [16]:
dec_tree_control_answer = d_tree_clss__model.predict(df_control_features)
dec_tree_control_accuracy = accuracy_score(df_control_target, dec_tree_control_answer)
dec_tree_control_accuracy

0.7562189054726368

In [17]:
random_forest_control_answer = random_forest_model.predict(df_control_features)
random_forest_control_accuracy = accuracy_score(df_control_target, random_forest_control_answer)
random_forest_control_accuracy

0.7985074626865671

In [18]:
log_reg_model_control_answer = log_reg_model.predict(df_control_features)
log_reg_model_control_answer
log_reg_model_control_accuracy = log_reg_model.score(df_control_features, df_control_target)
log_reg_model_control_accuracy

0.7039800995024875

In [19]:
model_names = ['dec_tree/valid','dec_tree/control','rnd_for/valid','rnd_for/control',
               'log_reg/valid','log_reg/control']
accuracy_data = [dec_tree_accuracy, dec_tree_control_accuracy, random_forest_accuracy, 
    random_forest_control_accuracy, log_reg_model_accuracy, log_reg_model_control_accuracy]

accuracy_result = pd.DataFrame.from_dict({'model/sample': model_names, 'Accuracy': accuracy_data})

In [20]:
accuracy_result

Unnamed: 0,model/sample,Accuracy
0,dec_tree/valid,0.763682
1,dec_tree/control,0.756219
2,rnd_for/valid,0.800995
3,rnd_for/control,0.798507
4,log_reg/valid,0.70398
5,log_reg/control,0.70398


### Выводы по моделям  
Лучший результат показала модель Случайный лес с гиперпараметрами n_estimators = 5, max_depth = 6  

## Этап 3: Проверка модели на вменяемость


In [21]:
from sklearn.metrics import mean_squared_error

In [22]:
log_reg_mse = mean_squared_error(df_control_target, log_reg_model_control_answer)
log_reg_rmse = log_reg_mse**0.5

random_forest_mse = mean_squared_error(df_control_target, random_forest_control_answer)
random_forest_rmse = random_forest_mse**0.5

dec_tree_mse = mean_squared_error(df_valid_target, dec_tree_control_answer)
dec_tree_rmse = dec_tree_mse**0.5

print(log_reg_rmse, random_forest_rmse, dec_tree_rmse)

0.5440771089629781 0.4488792012484348 0.6367673333454853


#### Выводы по дополнительному заданию:  
В связи с тем, что при проверке адекватности нам нужно прибегнут к случайным величинам, за основу мы берём вероятность правильного  
прогноза равную 0.5 (так как мы можем получить либо 0, либо 1). Проверка через RMSE показала результаты, которые противоречат 
результата исследования моделей по параметру accuracy:  
RMSE логистической регрессии = 0.5440771089629781  
RMSE случайного леса = 0.4488792012484348  
RMSE дерево решений = 0.6367673333454853  
Таким образом дерево решение наиболее вменяема или я сделал что-то неправильно.  

Спасибо за внимание.
