# Activity: Building a Full-Stack Web Application - Churn Prediction Dashboard

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pandas as pd

In [2]:
# Load customer data
### YOUR CODE HERE ### Step 1.3
customer_data = pd.read_csv("customer_data.csv")
# Display the first few rows of the dataset
customer_data.head()

Unnamed: 0.1,Unnamed: 0,churn,tenure,contract_renewal,data_plan,data_usage,cust_serv_calls,day_mins,DayCalls,monthly_charges,overage_fee,roam_minutes,state
0,0,0,128,Yes,1,2.7,1,265.1,110,89.0,9.87,10.0,NH
1,1,0,107,Yes,1,3.7,1,161.6,123,82.0,9.78,13.7,ME
2,2,0,137,Yes,0,0.0,0,243.4,114,52.0,6.06,12.2,ME
3,3,0,84,No,0,0.0,2,299.4,71,57.0,3.1,6.6,ME
4,4,0,75,No,0,0.0,3,166.7,113,41.0,7.42,10.1,ME


In [3]:
customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        3333 non-null   int64  
 1   churn             3333 non-null   int64  
 2   tenure            3333 non-null   int64  
 3   contract_renewal  3333 non-null   object 
 4   data_plan         3333 non-null   int64  
 5   data_usage        3333 non-null   float64
 6   cust_serv_calls   3333 non-null   int64  
 7   day_mins          3333 non-null   float64
 8   DayCalls          3333 non-null   int64  
 9   monthly_charges   3333 non-null   float64
 10  overage_fee       3333 non-null   float64
 11  roam_minutes      3333 non-null   float64
 12  state             3333 non-null   object 
dtypes: float64(5), int64(6), object(2)
memory usage: 338.6+ KB


In [4]:
customer_data.isnull().sum()

Unnamed: 0          0
churn               0
tenure              0
contract_renewal    0
data_plan           0
data_usage          0
cust_serv_calls     0
day_mins            0
DayCalls            0
monthly_charges     0
overage_fee         0
roam_minutes        0
state               0
dtype: int64

In [5]:
customer_data['tenure'].mean()

101.06480648064806

In [6]:
customer_data['tenure'] = customer_data['tenure'].fillna(value=101)

In [7]:
customer_data['state']

0       NH
1       ME
2       ME
3       ME
4       ME
        ..
3328    NH
3329    VT
3330    VT
3331    NH
3332    VT
Name: state, Length: 3333, dtype: object

In [8]:
state_encoder = LabelEncoder()

In [9]:
customer_data['encoded_state'] = state_encoder.fit_transform(customer_data['state'])

In [10]:
customer_data['encoded_state']

0       1
1       0
2       0
3       0
4       0
       ..
3328    1
3329    2
3330    2
3331    1
3332    2
Name: encoded_state, Length: 3333, dtype: int32

In [11]:
customer_data['encoded_state'].value_counts()

encoded_state
2    1138
0    1104
1    1091
Name: count, dtype: int64

In [12]:
customer_data.head()

Unnamed: 0.1,Unnamed: 0,churn,tenure,contract_renewal,data_plan,data_usage,cust_serv_calls,day_mins,DayCalls,monthly_charges,overage_fee,roam_minutes,state,encoded_state
0,0,0,128,Yes,1,2.7,1,265.1,110,89.0,9.87,10.0,NH,1
1,1,0,107,Yes,1,3.7,1,161.6,123,82.0,9.78,13.7,ME,0
2,2,0,137,Yes,0,0.0,0,243.4,114,52.0,6.06,12.2,ME,0
3,3,0,84,No,0,0.0,2,299.4,71,57.0,3.1,6.6,ME,0
4,4,0,75,No,0,0.0,3,166.7,113,41.0,7.42,10.1,ME,0


In [13]:
# Scale numerical columns

scaler = StandardScaler()

In [14]:
columns_to_scale = ['tenure', 'monthly_charges']
customer_data[columns_to_scale] = scaler.fit_transform(customer_data[columns_to_scale])

In [15]:
customer_data.head()

Unnamed: 0.1,Unnamed: 0,churn,tenure,contract_renewal,data_plan,data_usage,cust_serv_calls,day_mins,DayCalls,monthly_charges,overage_fee,roam_minutes,state,encoded_state
0,0,0,0.676489,Yes,1,2.7,1,265.1,110,1.990727,9.87,10.0,NH,1
1,1,0,0.149065,Yes,1,3.7,1,161.6,123,1.56451,9.78,13.7,ME,0
2,2,0,0.902529,Yes,0,0.0,0,243.4,114,-0.262133,6.06,12.2,ME,0
3,3,0,-0.42859,No,0,0.0,2,299.4,71,0.042307,3.1,6.6,ME,0
4,4,0,-0.654629,No,0,0.0,3,166.7,113,-0.931902,7.42,10.1,ME,0


In [16]:
# Split the data into features and target
x = customer_data[['tenure', 'monthly_charges', 'encoded_state']]
y = customer_data['churn']

In [17]:
x.shape, y.shape

((3333, 3), (3333,))

In [18]:
# Split into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [19]:
x_train.shape, x_test.shape

((2666, 3), (667, 3))

In [20]:
# Train the logistic regression model
model = LogisticRegression()

In [21]:
model.fit(x_train,y_train)

In [22]:
y_pred = model.predict(x_test)

In [23]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [24]:
# Evaluate the model
print(f"Model Accuracy: {accuracy_score(y_test, y_pred)}")

Model Accuracy: 0.848575712143928


In [25]:
# Calculate average churn probability
churn_probabilities = model.predict_proba(x_test)[:, 1]
### YOUR CODE HERE ### Step 3.1
avg_churn_prob = churn_probabilities.mean()

In [26]:
churn_probabilities

array([0.15094786, 0.11431128, 0.099929  , 0.14313195, 0.13714638,
       0.17337833, 0.13155349, 0.17629888, 0.14846443, 0.1278886 ,
       0.13232984, 0.09548923, 0.13857821, 0.15964076, 0.19920335,
       0.13591544, 0.13082817, 0.12976015, 0.14972118, 0.13642771,
       0.13148857, 0.11802335, 0.16635784, 0.15308418, 0.11230906,
       0.15372282, 0.15682605, 0.15792152, 0.18351529, 0.14186037,
       0.14329056, 0.12694126, 0.12057072, 0.13049295, 0.13756468,
       0.16391589, 0.10972293, 0.15047285, 0.10288642, 0.17886854,
       0.12934006, 0.19840634, 0.15925769, 0.1417917 , 0.14522064,
       0.136808  , 0.1682633 , 0.21267235, 0.14566219, 0.12976483,
       0.12942623, 0.14219551, 0.10633269, 0.17685584, 0.10488176,
       0.13582242, 0.11485336, 0.12985846, 0.13045691, 0.16264129,
       0.17345266, 0.14769828, 0.15688585, 0.18103696, 0.12771631,
       0.13983685, 0.13894319, 0.23178495, 0.14689068, 0.14719765,
       0.1499262 , 0.11749841, 0.11947328, 0.1412016 , 0.12408

In [27]:
avg_churn_prob

0.1451703414264648

In [28]:
# Identify high-risk customers (e.g., those with a churn probability above a threshold)
### YOUR CODE HERE ### Step 3.2
high_risk_customers = (churn_probabilities > avg_churn_prob).sum()

In [29]:
high_risk_customers

293

In [30]:
def calculate_churn_and_high_risk(churn_series, avg_churn_prob):
    """Calculate churn rate and count high-risk customers."""
    ### YOUR CODE HERE ### Step 3.3
    churn_rate = churn_series.mean() 
    high_risk_mask = (churn_series > avg_churn_prob)

    ### YOUR CODE HERE ### Step 3.4
    high_risk_count = high_risk_mask.sum()
    return churn_rate, high_risk_count

In [31]:
def calc_data():

    # Create lists to store churn rates and high-risk counts by state
    churn_rate_by_state = []
    high_risk_by_state = []

    # Group by state and calculate churn rate and high-risk count
    for state, group in customer_data.groupby('state'):
        # Calculate the churn for the state
        churn_rate, high_risk_count = calculate_churn_and_high_risk(group['churn'], avg_churn_prob)
        # Append the state and churn rate to the list
        churn_rate_by_state.append({'state': state, 'churn_rate': churn_rate})
        # Append the state and high risk count to the list
        high_risk_by_state.append({'state': state, 'high_risk': high_risk_count})

    # Convert lists to DataFrames
    churn_rate_by_state_df = pd.DataFrame(churn_rate_by_state)
    high_risk_by_state_df = pd.DataFrame(high_risk_by_state)

    print("Results of your analysis for reference:")
    print(f"Average Churn Probability: {avg_churn_prob}")
    print(f"High-Risk Customers: {high_risk_customers}")
    print(f"Churn Rate by State:\n {churn_rate_by_state_df}")
    print(f"High-Risk Customers by State:\n {high_risk_by_state_df}")

    # Store the results in a text file for autograding. Do not modify this code.
    with open('churn_results.txt', 'w') as f:
        f.write("Do not modify this file. It is used for autograding the processed data from the lab.\n\n")
        f.write(f"Average Churn Probability: {avg_churn_prob}\n\n")
        f.write(f"High-Risk Customers: {high_risk_customers}\n\n")
        f.write(f"Churn Rate by State:\n {churn_rate_by_state_df}\n\n")
        f.write(f"High-Risk Customers by State:\n {high_risk_by_state_df}")

    return avg_churn_prob, high_risk_customers, churn_rate_by_state_df, high_risk_by_state_df