# Model Selection
- Logistic Regression
- Random Forest
- Gradient Boosting

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Logistic Regression 

In [3]:
df = pd.read_csv("data/apps_info.csv")
df["success_class"] = df["score"].apply(lambda x: "high" if x >= 4.2 else "not_high")

In [4]:
X = df[["ratings_count", "downloads", "content_rating", "section", "categories"]]
y = df["success_class"]

In [6]:
numeric_features = ["ratings_count", "downloads"]
categorical_features = ["content_rating", "section", "categories"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    stratify=y, 
    test_size=0.2, 
    random_state=42
)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)


In [8]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[35  1]
 [ 8  0]]

Classification Report:
              precision    recall  f1-score   support

        high       0.81      0.97      0.89        36
    not_high       0.00      0.00      0.00         8

    accuracy                           0.80        44
   macro avg       0.41      0.49      0.44        44
weighted avg       0.67      0.80      0.72        44

