In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine Learning libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import KMeans

# Model persistence
import joblib
import pickle

# Interactive widgets
import ipywidgets as widgets
from IPython.display import display, HTML

# Dashboard
import dash
from dash import dcc, html
from dash.dependencies import Input, Output

# Set random seed for reproducibility
np.random.seed(42)

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')


In [3]:
# Load and prepare the dataset
df = pd.read_csv('data/raw/International_Education_Costs.csv')

# Recreate the engineered features from Phase 2
def prepare_features(df):
    """Prepare features for the model including engineered features."""
    
    # Calculate TCA
    df['TCA'] = (df['Tuition_USD'] + 
                 (df['Rent_USD'] * 12) + 
                 df['Visa_Fee_USD'] + 
                 (df['Insurance_USD'] * df['Duration_Years']))
    
    # Create derived features
    df['Tuition_to_Rent_Ratio'] = df['Tuition_USD'] / df['Rent_USD']
    df['Total_Living_Cost'] = (df['Rent_USD'] * 12 + 
                              df['Insurance_USD'] * df['Duration_Years'])
    df['Cost_per_Year'] = df['TCA'] / df['Duration_Years']
    
    # Create affordability tiers
    df['Affordability_Tier'] = pd.qcut(df['TCA'], q=3, 
                                     labels=['Low', 'Medium', 'High'])
    
    return df

# Prepare the dataset
df = prepare_features(df)

print("Dataset prepared with the following features:")
print("\nNumerical features:")
print(df.select_dtypes(include=['int64', 'float64']).columns.tolist())
print("\nCategorical features:")
print(df.select_dtypes(include=['object']).columns.tolist())


Dataset prepared with the following features:

Numerical features:
['Duration_Years', 'Tuition_USD', 'Living_Cost_Index', 'Rent_USD', 'Visa_Fee_USD', 'Insurance_USD', 'Exchange_Rate', 'TCA', 'Tuition_to_Rent_Ratio', 'Total_Living_Cost', 'Cost_per_Year']

Categorical features:
['Country', 'City', 'University', 'Program', 'Level']


In [4]:
# Define model export utilities
class ModelExporter:
    """Utility class for exporting and loading models with metadata."""
    
    def __init__(self, model_dir='models'):
        """Initialize with model directory."""
        self.model_dir = model_dir
        os.makedirs(model_dir, exist_ok=True)
        
    def save_model(self, model, model_name, metadata=None):
        """Save model and its metadata."""
        # Create model path
        model_path = os.path.join(self.model_dir, f"{model_name}.joblib")
        meta_path = os.path.join(self.model_dir, f"{model_name}_metadata.json")
        
        # Save model
        joblib.dump(model, model_path)
        
        # Save metadata
        if metadata is None:
            metadata = {}
        metadata.update({
            'model_name': model_name,
            'created_at': pd.Timestamp.now().isoformat(),
            'model_type': type(model).__name__
        })
        with open(meta_path, 'w') as f:
            json.dump(metadata, f, indent=2)
            
    def load_model(self, model_name):
        """Load model and its metadata."""
        model_path = os.path.join(self.model_dir, f"{model_name}.joblib")
        meta_path = os.path.join(self.model_dir, f"{model_name}_metadata.json")
        
        # Load model
        model = joblib.load(model_path)
        
        # Load metadata
        with open(meta_path, 'r') as f:
            metadata = json.load(f)
            
        return model, metadata

# Create model directory
import os
import json
os.makedirs('models', exist_ok=True)

# Initialize model exporter
model_exporter = ModelExporter()

print("Model export utilities created successfully!")
print(f"Models will be saved in: {os.path.abspath('models')}")


Model export utilities created successfully!
Models will be saved in: /Users/frank/SDS-CP030-edu-spend/submissions/team-members/frank-brown/models


In [5]:
# Recreate and save our best models from Phase 2

# 1. TCA Prediction Model (Random Forest)
def create_tca_model():
    """Create and train the TCA prediction model."""
    # Define features
    categorical_features = ['Country', 'City', 'University', 'Program', 'Level']
    numerical_features = ['Duration_Years', 'Tuition_USD', 'Living_Cost_Index', 
                         'Rent_USD', 'Visa_Fee_USD', 'Insurance_USD', 'Exchange_Rate']
    
    # Create preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(drop='first', sparse_output=False, 
                                handle_unknown='ignore'), categorical_features)
        ])
    
    # Create pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
    ])
    
    # Prepare data
    X = df[numerical_features + categorical_features]
    y = df['TCA']
    
    # Train model
    pipeline.fit(X, y)
    
    # Save model
    metadata = {
        'features': {
            'numerical': numerical_features,
            'categorical': categorical_features
        },
        'target': 'TCA',
        'metrics': {
            'r2_score': r2_score(y, pipeline.predict(X))
        }
    }
    model_exporter.save_model(pipeline, 'tca_predictor', metadata)
    
    return pipeline, metadata

# Create and save TCA model
tca_model, tca_metadata = create_tca_model()
print("TCA Prediction Model saved successfully!")
print(f"Model performance (R² Score): {tca_metadata['metrics']['r2_score']:.3f}")


TCA Prediction Model saved successfully!
Model performance (R² Score): 1.000


In [6]:
# 2. Affordability Classification Model
def create_affordability_model():
    """Create and train the affordability classification model."""
    # Define features (excluding TCA to avoid data leakage)
    categorical_features = ['Country', 'City', 'University', 'Program', 'Level']
    numerical_features = ['Duration_Years', 'Tuition_USD', 'Living_Cost_Index', 
                         'Rent_USD', 'Visa_Fee_USD', 'Insurance_USD', 'Exchange_Rate',
                         'Tuition_to_Rent_Ratio', 'Total_Living_Cost', 'Cost_per_Year']
    
    # Create preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(drop='first', sparse_output=False, 
                                handle_unknown='ignore'), categorical_features)
        ])
    
    # Create pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    
    # Prepare data
    X = df[numerical_features + categorical_features]
    y = df['Affordability_Tier']
    
    # Train model
    pipeline.fit(X, y)
    
    # Calculate accuracy
    accuracy = pipeline.score(X, y)
    
    # Save model
    metadata = {
        'features': {
            'numerical': numerical_features,
            'categorical': categorical_features
        },
        'target': 'Affordability_Tier',
        'classes': list(y.unique()),
        'metrics': {
            'accuracy': accuracy
        }
    }
    model_exporter.save_model(pipeline, 'affordability_classifier', metadata)
    
    return pipeline, metadata

# Create and save Affordability model
affordability_model, affordability_metadata = create_affordability_model()
print("Affordability Classification Model saved successfully!")
print(f"Model performance (Accuracy): {affordability_metadata['metrics']['accuracy']:.3f}")
print(f"Classes: {affordability_metadata['classes']}")


Affordability Classification Model saved successfully!
Model performance (Accuracy): 1.000
Classes: ['High', 'Medium', 'Low']


In [7]:
# Create the interactive prediction interface
class CostPredictionTool:
    """Interactive tool for education cost prediction."""
    
    def __init__(self, tca_model, affordability_model):
        self.tca_model = tca_model
        self.affordability_model = affordability_model
        self.setup_widgets()
        
    def setup_widgets(self):
        """Create input widgets."""
        # Get unique values for categorical fields
        self.countries = sorted(df['Country'].unique())
        self.programs = sorted(df['Program'].unique())
        self.levels = sorted(df['Level'].unique())
        
        # Create widgets
        self.country_widget = widgets.Dropdown(
            options=self.countries,
            description='Country:',
            style={'description_width': 'initial'}
        )
        
        self.program_widget = widgets.Dropdown(
            options=self.programs,
            description='Program:',
            style={'description_width': 'initial'}
        )
        
        self.level_widget = widgets.Dropdown(
            options=self.levels,
            description='Level:',
            style={'description_width': 'initial'}
        )
        
        self.duration_widget = widgets.FloatSlider(
            value=2.0,
            min=1.0,
            max=5.0,
            step=0.5,
            description='Duration (years):',
            style={'description_width': 'initial'}
        )
        
        self.tuition_widget = widgets.IntSlider(
            value=20000,
            min=500,
            max=100000,
            step=500,
            description='Tuition (USD):',
            style={'description_width': 'initial'}
        )
        
        self.predict_button = widgets.Button(
            description='Predict Costs',
            button_style='primary'
        )
        self.predict_button.on_click(self.make_prediction)
        
        self.output = widgets.Output()
        
    def display(self):
        """Display the interactive interface."""
        # Create layout
        input_box = widgets.VBox([
            self.country_widget,
            self.program_widget,
            self.level_widget,
            self.duration_widget,
            self.tuition_widget,
            self.predict_button
        ])
        
        # Display everything
        display(widgets.VBox([input_box, self.output]))
        
    def make_prediction(self, b):
        """Make predictions based on input values."""
        with self.output:
            self.output.clear_output()
            
            # Create input data
            input_data = pd.DataFrame({
                'Country': [self.country_widget.value],
                'Program': [self.program_widget.value],
                'Level': [self.level_widget.value],
                'Duration_Years': [self.duration_widget.value],
                'Tuition_USD': [self.tuition_widget.value],
                # Add other required features with reasonable defaults
                'Living_Cost_Index': [df[df['Country'] == self.country_widget.value]['Living_Cost_Index'].mean()],
                'Rent_USD': [df[df['Country'] == self.country_widget.value]['Rent_USD'].mean()],
                'Visa_Fee_USD': [df[df['Country'] == self.country_widget.value]['Visa_Fee_USD'].mean()],
                'Insurance_USD': [df[df['Country'] == self.country_widget.value]['Insurance_USD'].mean()],
                'Exchange_Rate': [df[df['Country'] == self.country_widget.value]['Exchange_Rate'].mean()]
            })
            
            # Make predictions
            tca_pred = self.tca_model.predict(input_data)[0]
            affordability = self.affordability_model.predict(input_data)[0]
            
            # Display results
            print(f"Estimated Total Cost of Attendance: ${tca_pred:,.2f}")
            print(f"Affordability Classification: {affordability}")
            
            # Create visualization
            self.plot_cost_breakdown(tca_pred, input_data)
            
    def plot_cost_breakdown(self, tca, input_data):
        """Create a visual breakdown of costs."""
        # Calculate cost components
        tuition = input_data['Tuition_USD'].values[0]
        duration = input_data['Duration_Years'].values[0]
        rent = input_data['Rent_USD'].values[0] * 12 * duration
        insurance = input_data['Insurance_USD'].values[0] * duration
        visa = input_data['Visa_Fee_USD'].values[0]
        
        # Create pie chart
        fig = go.Figure(data=[go.Pie(
            labels=['Tuition', 'Rent', 'Insurance', 'Visa'],
            values=[tuition, rent, insurance, visa],
            hole=.3
        )])
        
        fig.update_layout(
            title="Cost Breakdown",
            showlegend=True,
            width=600,
            height=400
        )
        
        fig.show()

# Create and display the prediction tool
prediction_tool = CostPredictionTool(tca_model, affordability_model)
prediction_tool.display()


VBox(children=(VBox(children=(Dropdown(description='Country:', options=('Algeria', 'Argentina', 'Australia', '…

In [None]:
# Create the Dash application
app = dash.Dash(__name__)

# Define the layout
app.layout = html.Div([
    # Header
    html.H1('Education Cost Analysis Dashboard'),
    
    # Filters
    html.Div([
        html.H3('Filters'),
        dcc.Dropdown(
            id='country-filter',
            options=[{'label': c, 'value': c} for c in sorted(df['Country'].unique())],
            placeholder='Select Country'
        ),
        dcc.Dropdown(
            id='program-filter',
            options=[{'label': p, 'value': p} for p in sorted(df['Program'].unique())],
            placeholder='Select Program'
        ),
        dcc.Dropdown(
            id='level-filter',
            options=[{'label': l, 'value': l} for l in sorted(df['Level'].unique())],
            placeholder='Select Level'
        )
    ], style={'padding': '20px'}),
    
    # Cost Distribution
    html.Div([
        html.H3('Cost Distribution'),
        dcc.Graph(id='cost-distribution')
    ]),
    
    # Geographic View
    html.Div([
        html.H3('Geographic Cost Overview'),
        dcc.Graph(id='geo-visualization')
    ]),
    
    # Program Comparison
    html.Div([
        html.H3('Program Comparison'),
        dcc.Graph(id='program-comparison')
    ])
])

# Define callbacks
@app.callback(
    [Output('cost-distribution', 'figure'),
     Output('geo-visualization', 'figure'),
     Output('program-comparison', 'figure')],
    [Input('country-filter', 'value'),
     Input('program-filter', 'value'),
     Input('level-filter', 'value')]
)
def update_graphs(country, program, level):
    # Filter data
    filtered_df = df.copy()
    if country:
        filtered_df = filtered_df[filtered_df['Country'] == country]
    if program:
        filtered_df = filtered_df[filtered_df['Program'] == program]
    if level:
        filtered_df = filtered_df[filtered_df['Level'] == level]
    
    # Create cost distribution figure
    cost_dist = px.box(filtered_df, 
                      y='TCA',
                      color='Level',
                      title='Total Cost Distribution')
    
    # Create geographic visualization
    geo_viz = px.scatter_geo(filtered_df,
                            locations='Country',
                            locationmode='country names',
                            size='TCA',
                            color='Affordability_Tier',
                            title='Global Cost Overview')
    
    # Create program comparison
    prog_comp = px.bar(filtered_df.groupby('Program')['TCA'].mean().reset_index(),
                      x='Program',
                      y='TCA',
                      title='Average Cost by Program')
    
    return cost_dist, geo_viz, prog_comp

# Note: To run the dashboard, uncomment the following line:
app.run_server(debug=True, port=8050)

print("Dashboard created successfully!")
print("To launch the dashboard, run: app.run_server(debug=True, port=8050)")
print("Then open a web browser and navigate to: http://localhost:8050")
