<a href="https://colab.research.google.com/github/coombesmatthew/NYC_Taxi_Data/blob/main/M1_Retention_Factors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Main Question: What factors impact M1 Retention for First Time Users?

In [18]:
# Import Necessary Packages

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
import numpy as np

In [19]:
# Load your dataset
df = pd.read_csv("/content/test_sample.csv")

df.head()

Unnamed: 0,user_id,city,Return_in_30_Days,Restaurant_Name,Delivery_Fee,Items,Missing_Item,Order_Value,Total_Discount,Completion_Time
0,6154359,Eastern Province,0,Baskin Robbins,49.5,2,0,35.07,0.0,21.15
1,6154381,Hafr Al Batin,0,Herfy,27.5,2,0,50.1,0.0,18.9
2,3984563,Jeddah,0,Burger King,0.0,16,0,88.51,0.0,12.6
3,6154448,Riyadh,0,SWL,0.0,6,0,67.969,0.0,13.5
4,6091932,Riyadh,1,Shaikh Al Kabsa,0.0,2,0,66.8,0.0,26.1


In [26]:
# Check for any invalid entries

# Example: Only rows where 'Order_Value' is missing
df[df['Order_Value'].isna()]

# Drop rows with any NaN values
df_cleaned = df.dropna()

print(df_cleaned)


        user_id              city  Return_in_30_Days  \
0       6154359  Eastern Province                  0   
1       6154381     Hafr Al Batin                  0   
2       3984563            Jeddah                  0   
3       6154448            Riyadh                  0   
4       6091932            Riyadh                  1   
...         ...               ...                ...   
299995  3903728           Al Ahsa                  0   
299996  4108926              Abha                  1   
299997  6879059            Riyadh                  1   
299998  6877634            Riyadh                  1   
299999  5546623             Mecca                  1   

                         Restaurant_Name  Delivery_Fee  Items  Missing_Item  \
0                         Baskin Robbins          49.5      2             0   
1                                  Herfy          27.5      2             0   
2                            Burger King           0.0     16             0   
3          

In [45]:
# Set X and Y
X = df_cleaned.drop(columns=['user_id', 'Return_in_30_Days'])  # Drop columns that have no impact on results i.e. thing you are testing for and any identity (e.g. user_id, order_id)
y = df_cleaned['Return_in_30_Days']

# Define categorical and numerical features
categorical_features = ['city']
numerical_features = ['Delivery_Fee', 'Items', 'Missing_Item', 'Order_Value', 'Total_Discount', 'Completion_Time']

# Build preprocessor (scales numerics and one-hot encodes categoricals)
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(drop='first'), categorical_features)
])


In [47]:
# Create full pipeline with logistic regression
model = make_pipeline(preprocessor, LogisticRegression(max_iter=1000, C=1.0))

# Fit on the entire dataset (exploratory only)
model.fit(X, y)

# Get feature names after preprocessing
feature_names_num = numerical_features
encoder = model.named_steps['columntransformer'].named_transformers_['cat']
feature_names_cat = encoder.get_feature_names_out(categorical_features)
all_features = np.concatenate([feature_names_num, feature_names_cat])

# Get coefficients and convert to odds ratios
logreg = model.named_steps['logisticregression']
odds_ratios = pd.DataFrame({
    'Feature': all_features,
    'Coefficient': logreg.coef_[0],
    'Odds Ratio': np.exp(logreg.coef_[0])
}).sort_values(by='Odds Ratio', ascending=False)

# Display results
print("\nFeature importances (odds ratios):")
print(odds_ratios)

odds_ratios.head()



Feature importances (odds ratios):
                  Feature  Coefficient  Odds Ratio
13            city_Jubail     0.324731    1.383658
21           city_Unaizah     0.100646    1.105885
10     city_Hafr Al Batin     0.099010    1.104077
19             city_Tabuk     0.056023    1.057622
9   city_Eastern Province     0.042155    1.043056
11              city_Hail     0.013725    1.013819
1                   Items     0.010981    1.011041
2            Missing_Item     0.000000    1.000000
17            city_Rabigh    -0.001406    0.998595
14    city_Khamis Mushait    -0.008336    0.991699
6            city_Al Ahsa    -0.014910    0.985201
20              city_Taif    -0.016975    0.983168
8          city_Burayadah    -0.028259    0.972136
4          Total_Discount    -0.055014    0.946472
3             Order_Value    -0.055081    0.946408
22             city_Yanbu    -0.055879    0.945654
5         Completion_Time    -0.089338    0.914536
0            Delivery_Fee    -0.141736    0.86

Unnamed: 0,Feature,Coefficient,Odds Ratio
13,city_Jubail,0.324731,1.383658
21,city_Unaizah,0.100646,1.105885
10,city_Hafr Al Batin,0.09901,1.104077
19,city_Tabuk,0.056023,1.057622
9,city_Eastern Province,0.042155,1.043056
