In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_score, train_test_split


In [2]:
# Load the dataset
df = pd.read_csv('trainhackathon.csv')
df_test=pd.read_csv('Testhackathon.csv')
df.head()

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,MRG,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN
0,dcf68cc2fb515ccad7d8b9b3bd80ee2a4b270063,SAINT-LOUIS,K > 24 month,17000.0,32.0,18000.0,6000.0,34.0,,97.0,355.0,6.0,,,NO,62,All-net 500F=2000F;5d,35.0,0
1,71c44b5ba328db5c4192a80f7cf8f244d9350ed0,,K > 24 month,4300.0,29.0,4427.0,1476.0,37.0,1764.0,8.0,3.0,0.0,,2.0,NO,40,"Data: 100 F=40MB,24H",22.0,0
2,ce46411b1526c94f20a383b8cb188f8d27f82a0a,TAMBACOUNDA,K > 24 month,1500.0,3.0,1500.0,500.0,3.0,,30.0,30.0,,,,NO,32,All-net 500F=2000F;5d,3.0,0
3,f467cdb6669818373c26c2bad44e01ba66f97d21,FATICK,K > 24 month,1500.0,3.0,2497.0,832.0,4.0,0.0,159.0,45.0,19.0,,,NO,18,On net 200F=Unlimited _call24H,3.0,0
4,ec45e1a1888a32b5dcce0954cfec20c6e037db31,FATICK,K > 24 month,,,498.0,166.0,3.0,1.0,1.0,3.0,,,,NO,50,,,0


In [3]:
 df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 19 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   user_id         400000 non-null  object 
 1   REGION          242480 non-null  object 
 2   TENURE          400000 non-null  object 
 3   MONTANT         259723 non-null  float64
 4   FREQUENCE_RECH  259723 non-null  float64
 5   REVENUE         265337 non-null  float64
 6   ARPU_SEGMENT    265337 non-null  float64
 7   FREQUENCE       265337 non-null  float64
 8   DATA_VOLUME     203146 non-null  float64
 9   ON_NET          254181 non-null  float64
 10  ORANGE          233683 non-null  float64
 11  TIGO            160614 non-null  float64
 12  ZONE1           31690 non-null   float64
 13  ZONE2           25513 non-null   float64
 14  MRG             400000 non-null  object 
 15  REGULARITY      400000 non-null  int64  
 16  TOP_PACK        232671 non-null  object 
 17  FREQ_TOP_P

In [4]:
df.shape

(400000, 19)

In [5]:
df.describe()

Unnamed: 0,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,REGULARITY,FREQ_TOP_PACK,CHURN
count,259723.0,259723.0,265337.0,265337.0,265337.0,203146.0,254181.0,233683.0,160614.0,31690.0,25513.0,400000.0,232671.0,400000.0
mean,5522.971346,11.503733,5505.487757,1835.167658,13.951835,3369.763441,275.917586,95.532927,23.134608,7.874282,7.187003,28.046502,9.254209,0.18711
std,7099.64063,13.275514,7175.802367,2391.92929,14.679943,12281.867504,873.593034,204.733272,63.061871,35.642843,26.964028,22.282773,12.305563,0.39
min,20.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
25%,1000.0,2.0,1000.0,333.0,3.0,0.0,5.0,7.0,2.0,0.0,0.0,6.0,2.0,0.0
50%,3000.0,6.0,3000.0,1000.0,9.0,267.0,27.0,29.0,6.0,1.0,2.0,24.0,5.0,0.0
75%,7300.0,15.0,7340.0,2447.0,19.0,2925.0,155.0,99.0,20.0,3.0,5.0,51.0,12.0,0.0
max,226550.0,133.0,233413.0,77804.0,91.0,934576.0,45011.0,6788.0,2758.0,1657.0,1011.0,62.0,629.0,1.0


In [6]:
X = df.drop(columns=['user_id', 'CHURN'])
y = df['CHURN']
X_test1=df_test.drop(columns=['user_id'])

In [7]:
num = ['MONTANT', 'FREQUENCE_RECH', 'REVENUE', 'ARPU_SEGMENT', 'FREQUENCE', 'DATA_VOLUME', 
                      'ON_NET', 'ORANGE', 'TIGO', 'ZONE1', 'ZONE2', 'FREQ_TOP_PACK']
# Define transformers
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [8]:
cat= ['REGION', 'TENURE', 'MRG', 'TOP_PACK']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num),
        ('cat', categorical_transformer, cat)])


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1111)

In [11]:
# Define a simplified model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=1111))
])

In [12]:
# Cross-validation to evaluate the model
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_log_loss')
mean_cv_score = -cv_scores.mean()


In [13]:
model.fit(X_train, y_train)

# Predict on the test set
y_pred_proba_ = model.predict_proba(X_test)[:, 1]


In [14]:
# Calculate Log Loss
log_loss = log_loss(y_test, y_pred_proba_)

mean_cv_score, log_loss

(0.34933756102851193, 0.34681276499448294)

In [15]:
y_test1 = model.predict_proba(X_test1)[:, 1]
final_sub= pd.DataFrame({'user_id': df_test['user_id'], 'CHURN': y_pred_proba_})
final_sub['CHURN']=final_sub['CHURN'].round(2)
final_sub.head(5)


Unnamed: 0,user_id,CHURN
0,af900d87e73b7ff6509d2203df4704a98aa5f2a6,0.42
1,5335efd940280b82143272275637d1e65d37eadb,0.01
2,a581f4fa08677c26f83f643248c667e241043086,0.04
3,64f67177d0775262b8087a9e2e3b8061b6324ae6,0.36
4,0d6009a4594c4be22449b8d9cc01a0bcea98faea,0.01


In [16]:
import os  
os.makedirs('Downloads/DSN_HACKATHON_SUBMISSION', exist_ok=True)
final_sub.to_csv('Downloads/DSN_HACKATHON_SUBMISSION.csv') 