# Classification problem

# Research question: 
Can we predict an app's success from its Google Play Store features? (Success is defined as having a mean rating above 4.2 and at least 100 000 installs.) The data set comes from Kaggle (collected by web scraping).
https://www.kaggle.com/lava18/google-play-store-apps
Method: Logistic regression model

In [None]:
#Importing the packages 
import datetime
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import precision_recall_curve
from sklearn import metrics

# Hide deprecation-related warnings only (DeprecationWarning / FutureWarning).
# A blanket filterwarnings('ignore') would also silence useful diagnostics such
# as solver convergence warnings, so the filter is restricted by category.
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Mount Google Drive at /gdrive so that the data set can be read from a regular
# file path below (the google.colab module is presumably only available when
# running inside Google Colab)
from google.colab import drive
drive.mount('/gdrive')

## Load the data

In [None]:
# Load the final feature vector CSV from the mounted Drive folder into a DataFrame
fv = pd.read_csv(
    filepath_or_buffer="/gdrive/My Drive/AI/CURRICULUM/Giovanna/170619/170619final_feature_vector.csv"
)

In [None]:
# Discard groups of features to see how the model behaves and which features were kept as relevant to predict success
# developer_relevant_fv = [i for i in fv.columns.tolist() if (i[:8] != 'Category' and i[:4] != 'year' and i[:5] != 'month')]
# non_category_features = [i for i in fv.columns.tolist() if i[:8] != 'Category']
# non_year_features = [i for i in fv.columns.tolist() if i[:4] != 'year']
# fv = fv[developer_relevant_fv]

In [None]:
# Dimensions of the loaded feature vector as a (rows, columns) tuple
fv.shape

In [None]:
# Names of all columns in the feature vector (features plus the target label)
fv.columns

## Train/Test Split

In [None]:
# Separate the target label from the predictor columns
y = fv['Successful_App']
X = fv.drop(labels=['Successful_App'], axis='columns')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## Logistic Regression - Grid Search & Cross Validation

See the list of scoring functions [here](https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter)

In [None]:
# Base estimator whose hyperparameters are tuned below
logistic = LogisticRegression()

# Search space: L1 penalty only, six regularization strengths
# (C = 1/λ, so a smaller C means stronger regularization) and two solvers
penalty = ['l1']
C = [0.1, 0.2, 0.4, 0.6, 0.8, 1]
solver = ['liblinear', 'saga']
hyperparameters = {'penalty': penalty, 'C': C, 'solver': solver}

# Exhaustive grid search scored by accuracy with 5-fold cross-validation
clf = GridSearchCV(
    estimator=logistic,
    param_grid=hyperparameters,
    scoring='accuracy',
    cv=5,
)

# Evaluate every combination on the training data; fit() returns the fitted search object
best_model = clf.fit(X_train, y_train)

In [None]:
# Full grid-search results: per-candidate parameters, scores and timings
# (the 'params', 'mean_test_score' and 'mean_fit_time' entries are used further below)
best_model.cv_results_

In [None]:
# Hyperparameter combination that performed best on average across the
# cross-validation folds
best_model.best_params_

In [None]:
# Mean cross-validated score (accuracy, as requested via scoring='accuracy')
# obtained with the best hyperparameter combination
best_model.best_score_

### Extract factors that are relevant after regularization and store them for analysis

In [None]:
# Save the regularized coefficients of the best model for later analysis.
# The estimator selected by the grid search is reused instead of re-fitting a
# model with hand-copied hyperparameters: GridSearchCV (refit=True by default)
# has already re-trained the best configuration on the whole training set, so
# the exported coefficients always agree with `best_model.best_params_`.
logistic = best_model.best_estimator_

# One row per feature; coef_ has shape (1, n_features) for a binary problem,
# hence the [0]
coefficients = pd.DataFrame({'features': X_train.columns, 'coefficients': logistic.coef_[0]})
coefficients.to_csv("logistic_regularization_coefficients.csv", index=False, encoding='utf-8')

In [None]:
# Keep only the features with a strictly positive coefficient, ordered from the
# smallest to the largest weight, and draw them as a bar chart
is_positive = coefficients['coefficients'].gt(0)
relevance_factors = coefficients.loc[is_positive].sort_values(by='coefficients')
_ = sns.barplot(data=relevance_factors, x='coefficients', y='features', color='blue')

#### Visualize the results of hyperparameter optimization grid search

In [None]:
# Pull the per-candidate parameters, mean validation scores and mean fit times
# out of the grid-search results
params = best_model.cv_results_['params']
test_scores = best_model.cv_results_['mean_test_score']
times = best_model.cv_results_['mean_fit_time']

# One row per hyperparameter combination: C, solver, score and fit time.
# Train scores are not included; 'mean_train_score' could be added here if the
# search were run with return_train_score=True.
GS_performance = pd.DataFrame(params, columns=['C', 'solver'])
GS_performance['test_score'] = test_scores
GS_performance['fit_time'] = times

In [None]:
# _=sns.heatmap(GS_performance[['C','solver','train_score']].pivot_table(values='train_score', index='C', 
#                           columns='solver'), annot=True, fmt='.10f', cmap='Blues')
# _=plt.title("Performance on the model based on hyperparmeters with the train set")

In [None]:
# Mean score per (C, solver) pair; note that 'test_score' comes from
# cv_results_['mean_test_score'], i.e. the held-out cross-validation folds
test_score_grid = GS_performance.pivot_table(values='test_score', index='C', columns='solver')
_ = sns.heatmap(test_score_grid, annot=True, fmt='.10f', cmap='Blues')
_ = plt.title("Performance on the model based on hyperparmeters with the test set")

In [None]:
# Mean fit time per (C, solver) pair
fit_time_grid = GS_performance.pivot_table(values='fit_time', index='C', columns='solver')
_ = sns.heatmap(fit_time_grid, annot=True, fmt='.6f', cmap='Blues')
_ = plt.title("Performance on the model based on fit_time")

## Logistic Regression trained - Final value of the cross entropy (loss function)

**The cross entropy requires the predicted probability that each sample belongs to class 1.**

```-log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp))```

In [None]:
# Predicted class probabilities for both the training and the test set
y_pred_train, y_pred_test = (
    best_model.predict_proba(features) for features in (X_train, X_test)
)

# Cross entropy (log loss) between the true labels and the predicted
# probabilities, computed separately for each of the two sets
cross_entropy_train, cross_entropy_test = (
    log_loss(labels, probabilities)
    for labels, probabilities in ((y_train, y_pred_train), (y_test, y_pred_test))
)

In [None]:
# Report the final training loss next to the loss measured on the test data
loss_report = "The logistic regression was optimized until reaching a loss of {:.2f} with the trainig set.\nThe model has a cross entropy value of {:.2f} when using test data."
print(loss_report.format(cross_entropy_train, cross_entropy_test))

In [None]:
# Predict the class label (successful or not) of every app in the test set
y_pred = best_model.predict(X=X_test)

## Logistic Regression - Best Model Evaluation

![precision_recall](prec_recall.png "Title")

In [None]:
# Per-class precision, recall and F1 of the test-set predictions
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
# Confusion matrix normalized by the number of apps in each TRUE class, so that
# every row sums to 1 and the diagonal shows the recall of each class.
# confusion_matrix() puts the true labels on the rows and the predictions on
# the columns, hence the division by the row totals (axis=1); keepdims=True
# keeps the totals as a column vector so each row is divided by its own total.
cm = confusion_matrix(y_test, y_pred)
cm = cm.astype('float') / cm.sum(axis=1, keepdims=True)

# Rows are the true classes, columns the predicted ones
_ = sns.heatmap(cm, annot=True, cmap='Blues', fmt='.2f')
_ = plt.xlabel('Predicted')
_ = plt.ylabel('True')
_ = plt.title("Confusion Matrix True App success vs predicted")

### What is the best probability threshold to decide whether an App is successful or not?

In [None]:
# Class labels predicted for the test set (displayed for inspection only;
# the result is not stored)
best_model.predict(X_test)

In [None]:
# Class probabilities for every app in the test set; the second column
# (index 1) is the one used below as the probability of success
y_pred_test = best_model.predict_proba(X=X_test)

In [None]:
# Compare the sorted true labels (blue) with the sorted predicted
# probabilities of success (red) for the test set.
# Series.as_matrix() was removed in pandas 1.0; `.values` yields the same NumPy
# array and works on both old and new pandas versions.
positions = range(len(y_pred_test[:, 1]))
_ = plt.figure(figsize=(10, 3))
_ = plt.scatter(positions, np.sort(y_test.values), color='b')
_ = plt.scatter(positions, np.sort(y_pred_test[:, 1]), color='r')
_ = plt.title("Predicted probabilities of app success for the test dataset")

In [None]:
# Precision and recall for every candidate decision threshold, computed from
# the predicted probability in column 1
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_test[:, 1])
# Area under the precision-recall curve
pr_auc = metrics.auc(recall, precision)

# The trailing precision/recall entry is dropped so both curves line up with
# the thresholds array
plt.plot(thresholds, precision[:-1], "b--", label="Precision")
plt.plot(thresholds, recall[:-1], "r--", label="Recall")
plt.title("Precision-Recall vs Threshold Chart")
plt.xlabel("Threshold")
plt.ylabel("Precision, Recall")
plt.legend(loc="lower left")
_ = plt.ylim([0, 1])

### What is the area under the ROC curve (AUC), computed over all probability thresholds?

![ROC_curve](ROC_curves.png "Title")

In [None]:
# False positive rate, true positive rate and the matching thresholds,
# computed from the predicted probability in column 1
fpr, tpr, thresholds = metrics.roc_curve(y_true=y_test, y_score=y_pred_test[:, 1])

In [None]:
# Area under the ROC curve, from the predicted probability in column 1.
# NOTE: this rebinds the name `auc`, hiding the `auc` function imported from
# sklearn.metrics at the top of the notebook.
auc = roc_auc_score(y_true=y_test, y_score=y_pred_test[:, 1])
print('AUC: %.3f' % auc)

# Points of the ROC curve
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred_test[:, 1])

_ = plt.figure(figsize=(10, 8))

# Dashed diagonal: reference line of a random classifier
_ = plt.plot([0, 1], [0, 1], linestyle='--')

# ROC curve of the fitted model
_ = plt.plot(fpr, tpr, marker='.')
_ = plt.title("ROC curve for correct prediction of App Success with log regression")

In [None]:
# Plot each ROC threshold against its position in the thresholds array
_ = plt.figure(figsize=(10, 8))
threshold_positions = np.arange(len(thresholds))
_ = plt.scatter(threshold_positions, thresholds)
_ = plt.title('Thresholds over the probability distributions to calculate the integral')