In [321]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
import plotly.express as px


### Project description

We are working for mobile carrier Megaline that wants to develop a model that would analyze subscribers' behavior and recommend one of Megaline's newer plans: Smart or Ultra.
Using behavior data about subscribers who have already switched to the new plans we will develop a model  that will pick the right plan for each user and produce an accuracy resulut that is greater than threshold of 0.75. 


Essentially, we face a classification problem...

Let's start by importing the data

In [322]:
df = pd.read_csv("/datasets/users_behavior.csv")

#df = pd.read_csv(r"C:\Users\vlady\OneDrive\Desktop\LEARNING\Practivum\users_behavior.csv")

### Checking the data

In [323]:
df.sample(10)

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
1763,51.0,356.79,67.0,11568.16,1
2231,61.0,395.33,146.0,15215.51,1
2782,125.0,801.78,0.0,7517.55,1
2159,182.0,1225.19,99.0,13996.26,1
815,87.0,655.16,10.0,20192.8,0
1771,53.0,351.5,31.0,9766.09,0
140,94.0,647.46,57.0,17271.53,0
1738,87.0,550.48,70.0,13843.33,0
3199,104.0,702.47,0.0,23611.32,0
1899,27.0,164.14,97.0,17718.93,1


In [324]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calls     3214 non-null   float64
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   float64
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 125.7 KB


In [325]:
print("There are", df.isnull().sum().sum(), "nans in df file ")
print("There are", df.duplicated(keep='first').sum(), "duplicates in our df file ")


There are 0 nans in df file 
There are 0 duplicates in our df file 


### Subseting the data

In [326]:
# First to featurs and target:
features = df.drop("is_ultra", axis = 1)
target = df.is_ultra 

In [327]:
# Then to training, validation and test subsets
def train_val_test_split(X, y, train_size, val_size, test_size):
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size = test_size)
    relative_train_size = train_size / (val_size + train_size)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val,
                                                      train_size = relative_train_size, test_size = 1-relative_train_size)
    return X_train, X_val, X_test, y_train, y_val, y_test


In [328]:
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(features, target, train_size = 0.60, val_size = 0.2, test_size= 0.2)

### Let's check a sample of the subsets:

In [329]:
X_train.shape

(1928, 4)

In [330]:
X_val.shape

(643, 4)

In [331]:
y_test.shape

(643,)

### Working with Decision Tree

In order to find the best depth for our DesicionTree model, let's iterate over depths bt 1 and 20 and plot the results:

In [332]:
treee_predictions = {}
for depth in range(1, 21):
    tree_model = DecisionTreeClassifier( max_depth=depth, random_state=123456)
    tree_model.fit(X_train, y_train)
    predictions_valid = tree_model.predict(X_val)
    treee_predictions.update({str(depth): accuracy_score(y_val, predictions_valid)})
    print('max_depth =', depth, ': ', end='')
    print(accuracy_score(y_val, predictions_valid))
    
tree_results = pd.DataFrame.from_dict(treee_predictions, orient='index')
fig = px.line(tree_results, x=tree_results.index, y = 0, title='Test 1: Accuracy vs Tree depth / DecisionTreeClassifier')
fig.show()

max_depth = 1 : 0.7542768273716952
max_depth = 2 : 0.7822706065318819
max_depth = 3 : 0.7931570762052877
max_depth = 4 : 0.7916018662519441
max_depth = 5 : 0.7947122861586314
max_depth = 6 : 0.7838258164852255
max_depth = 7 : 0.7962674961119751
max_depth = 8 : 0.7916018662519441
max_depth = 9 : 0.8009331259720062
max_depth = 10 : 0.7962674961119751
max_depth = 11 : 0.8009331259720062
max_depth = 12 : 0.7993779160186625
max_depth = 13 : 0.7947122861586314
max_depth = 14 : 0.80248833592535
max_depth = 15 : 0.7962674961119751
max_depth = 16 : 0.7822706065318819
max_depth = 17 : 0.7698289269051322
max_depth = 18 : 0.7744945567651633
max_depth = 19 : 0.7651632970451011
max_depth = 20 : 0.7791601866251944


#### The graph shows us that different depths produce different accuracies. 
#### The best accuracy is bt 9 and 14. So wich one to choose?
#### In order to pick the best parameter, our desionion must be statistically robust. Let's cancel the random state parameter and run our test 500 times (!). 

### Let's repeat this experiment 500 times for each depth bt 1 and 20 and find the mean of accuracy of each depth:

* the code takes about 2 minutes to run

<div class="alert alert-success">
<b>Reviewer's comment</b>

Pretty cool idea, but it won't be feasible with larger grids or bigger models, where it's rarely possible to even consider all possible hyperparameter combinations, much less run the same grid search 500 times :)
    
Also, another way to make the results more robust is to use [cross-validation](https://scikit-learn.org/stable/modules/cross_validation.html). It should be covered in one of the later sprints.

</div>

In [340]:
treee_predictions = {}
best_score = 0
best_depth = 0

for test in range(1,500):
    for depth in range(1, 21):
        tree_model = DecisionTreeClassifier( max_depth=depth) # set the model
        tree_model.fit(X_train, y_train) # training the model 
        predictions_valid = tree_model.predict(X_val) # prediction on valid test
        score = accuracy_score(y_val, predictions_valid)
        
        treee_predictions.update({str(depth): score}) #Saving results
        best_score = np.mean(best_score + score)
        best_depth = depth


#### Let's put it on the graph: 

In [341]:
tree_results = pd.DataFrame.from_dict(treee_predictions, orient='index')
fig = px.line(tree_results, x=tree_results.index, y = 0, title='Test 2: Accuracy vs Tree depth / DecisionTreeClassifier')
fig.show()

#### Accuracy peaks at 9 depths. 10 or 11 would be our second choice. Since we are running without random state, we will get a slightly different tree number each time. But statistically, this is a more robust approach. 

### Now that we have our best Desicion Tree model, let's try Forest:

### Working with Random Forest

#### Here, our main parameter that will determine the accuracy of the test is the number of trees in the forest. In order to pick the optimal number we will iterate 100 times on models that have tree number bt 1 and 20 and then plot the means of accuracies for each num of trees. We run our forest in a random free state. 


In [342]:
best_score = 0
best_est = 0
forest_predictions = {}

for test in range(1,100):
    for est in range(1, 21): # choose hyperparameter range
        forest_model = RandomForestClassifier( n_estimators=est, max_depth = 9) # set number of trees
        forest_model.fit(X_train, y_train)
        score = forest_model.score(X_val, y_val) # calculation of accuracy score on validation set
        forest_predictions.update({ str(est): score })

        best_score = np.mean(best_score + score)
        best_est = est


#### Let's print the results:

In [343]:
forest_results = pd.DataFrame.from_dict(forest_predictions, orient='index')
fig = px.line(forest_results, x=forest_results.index, y = 0, title='Test 3: Accuracy vs num of trees / RandomForestClassifier')
fig.show()

#### By running the cell above several times (without random state) we narrowed down our best performing number of trees to 16.


### Logistic Regression

In [344]:

model = LogisticRegression(random_state=12345, solver='lbfgs' )
model.fit(X_train, y_train)
score = model.score(X_val, y_val)
print("Accuracy of Logistic Regression on the validation set: {}".format(score))


Accuracy of Logistic Regression on the validation set: 0.6967340590979783


<div class="alert alert-success">
<b>Reviewer's comment</b>

Great, you tried a couple of different models and tuned their hyperparameters using the validation set.

</div>

### Moving to tests

The logistic regression model performed the worst in our pre-testing. Therefore for our final evaluation we will only use the tree and the forest models.
For the DecisionTreeClassifier model, the optimal depth is 8.
For the RandomForestClassifier model,  the optimal performance is 16  trees.

We will test and compare the performance of both models based on these setups.

Just before we do that let's "feed" our forest_model and tree_model their best params:

#### Feeding the Tree Model wt best param and testing on test subset:

In [358]:

tree_model = DecisionTreeClassifier(max_depth=9, random_state = 987)
tree_model.fit(X_train, y_train)
tree_score = tree_model.score(X_test, y_test)
    
print("Tree test score", tree_score)


Tree test score 0.7993779160186625


#### Feeding the Forest Model wt best param and testing on test subset:

In [357]:


forest_model = RandomForestClassifier( n_estimators= 16, max_depth=9, random_state = 987 ) 
forest_model.fit(X_train, y_train)
forest_score = forest_model.score(X_test, y_test) # calculation of accuracy score on validation set
    
print("Forest test score", forest_score)


Forest test score 0.8258164852255054


Our Forest model generated a slightly better result. 82% accuracy

<div class="alert alert-success">
<b>Reviewer's comment</b>

The final models were evaluated on the test set

</div>

### Sanity check:

In [359]:
df.columns

Index(['calls', 'minutes', 'messages', 'mb_used', 'is_ultra'], dtype='object')

In [360]:
mb_med = df.mb_used.median()
messages_med = df.messages.median()
minutes_med =  df.minutes.median()

In [361]:
def sanity(data):
    minutes = data['minutes']
    messages = data['messages']
    mb = data['mb_used']
    if minutes > minutes_med:
        return 0
    elif messages > messages_med:
        return 1 
    elif mb > mb_med:
        return 0
    else:
        return 1
        

predictions_sanity = features.apply(sanity, axis=1)
result_sanity = accuracy_score(target, predictions_sanity)
result_sanity

0.5

By running sanity check we can validate the fact that the results that we have accomplished in our final test (above 80% accuracy) are in fact due to the performance of our model and not a pure chance. Simply shuffling the target column gives us 0.5% accuracy. 