In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle
import gdown
import os

# Function to create directory if it does not exist
def create_directory(directory):
    """Create ``directory`` (including missing parents) if it does not exist.

    Args:
        directory: Path of the directory to create. An empty string is
            accepted and treated as "nothing to create", because
            ``os.path.dirname`` returns ``""`` for a bare filename.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    # os.makedirs("") raises FileNotFoundError, so an empty path is a no-op.
    if not directory:
        return
    # exist_ok=True avoids the check-then-create race of
    # "if not exists: makedirs" when another process creates the directory.
    os.makedirs(directory, exist_ok=True)

# Function to download dataset from Google Drive
def download_dataset_from_gdrive(gdrive_url, output_path):
    """Download the dataset from Google Drive unless it is already on disk.

    Args:
        gdrive_url: URL handed to ``gdown.download``.
        output_path: Local file path the dataset is written to. If a file
            already exists at this path, nothing is downloaded.

    Raises:
        RuntimeError: If ``gdown.download`` returns ``None``, which is how
            some gdown versions report a failed download.
    """
    # Cached copy present: nothing to create or fetch.
    if os.path.exists(output_path):
        print("Dataset already exists. Skipping download.")
        return

    # A bare filename has an empty dirname; only create a directory when
    # the path actually names one.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        create_directory(parent_dir)

    print("Downloading dataset from Google Drive...")
    downloaded = gdown.download(gdrive_url, output_path, quiet=False)
    if downloaded is None:
        # Do not report success when no file was produced.
        raise RuntimeError(
            f"Failed to download dataset from {gdrive_url} to {output_path}"
        )
    print("Download complete!")

# Google Drive direct-download URL. "FILE_ID" is a placeholder and must be
# replaced with the real file ID before a download can succeed.
gdrive_url = "https://drive.google.com/uc?id=FILE_ID"
# Local path where the dataset CSV is stored and looked for on later runs.
dataset_path = "data/Expresso_churn_dataset.csv"

# Fetch the dataset; the helper skips the download when the file already exists.
download_dataset_from_gdrive(gdrive_url, dataset_path)

# Load the dataset into a module-level DataFrame; train_model() reads this
# global rather than taking the data as an argument.
df = pd.read_csv(dataset_path)

# Data preprocessing
def preprocess_data(df):
    """Return a zero-filled copy of ``df`` with its text columns label-encoded.

    Every missing value is replaced by 0. The columns ``user_id``,
    ``REGION``, ``TENURE``, ``MRG`` and ``TOP_PACK`` are then cast to
    strings and converted to integer labels with ``LabelEncoder``.
    No outlier handling is performed.

    Args:
        df: Raw dataset; it must contain the five columns listed above.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    encoded = df.fillna(0)

    for name in ('user_id', 'REGION', 'TENURE', 'MRG', 'TOP_PACK'):
        # Cast first so the filled-in 0 and the original strings share a type.
        as_text = encoded[name].astype(str)
        # fit_transform refits on every call, so a fresh encoder per column
        # gives the same result as reusing one instance.
        encoded[name] = LabelEncoder().fit_transform(as_text)

    return encoded

# Train model
def train_model(random_state=42):
    """Train a random-forest churn classifier and pickle it to disk.

    Reads the module-level ``df``, preprocesses it, holds out 20% of the
    rows for testing, fits a ``RandomForestClassifier``, prints the test
    accuracy and writes the fitted model to
    ``models/main_trained_model_1.sav``.

    Args:
        random_state: Seed used for both the train/test split and the
            classifier, so the printed accuracy and the saved model are
            reproducible across runs. Pass ``None`` for unseeded behavior.
    """
    df_cleaned = preprocess_data(df)
    X = df_cleaned.drop('CHURN', axis=1)
    y = df_cleaned['CHURN']

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    # Seed the forest as well: seeding only the split still leaves the
    # bootstrap sampling and feature selection random between runs.
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(X_train, y_train)

    # Test accuracy
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Model Accuracy:", accuracy)

    # Ensure models directory exists
    create_directory("models")

    # Save model
    with open("models/main_trained_model_1.sav", 'wb') as f:
        pickle.dump(clf, f)

# Train only when this file is the entry point; the dataset download and CSV
# load at module level above still run on import.
if __name__ == "__main__":
    train_model()


Dataset already exists. Skipping download.
Model Accuracy: 0.8528632111603723
