In [10]:
import pandas as pd
data = pd.read_csv("AI_Cleaned.csv", encoding="latin1")
print(data.columns.tolist())


['learner_signup_datetime', 'opportunity_id', 'opportunity_name', 'opportunity_category', 'opportunity_end_date', 'gender', 'country', 'entry_created_at', 'status_description', 'status_code', 'apply_date', 'opportunity_start_date', 'start_date_missing', 'age', 'age_band', 'apply_lag', 'start_lag', 'opportunity_duration', 'signup_cohort', 'completion_flag', 'stage_reached', 'institution_name']


In [11]:
# -------------------------------
# COLLABORATIVE FILTERING (Enhanced)
# -------------------------------

# Create a pivot table: institution vs. opportunity_category
# Each cell = average completion rate for that category at that institution
pivot_df = data.pivot_table(
    index='institution_name',
    columns='opportunity_category',
    values='completion_flag',
    aggfunc='mean'
).fillna(0)

# Compute cosine similarity between institutions
from sklearn.metrics.pairwise import cosine_similarity
inst_similarity = cosine_similarity(pivot_df)
inst_sim_df = pd.DataFrame(inst_similarity, index=pivot_df.index, columns=pivot_df.index)

# Function: Recommend similar institutions and their high-performing categories
def recommend_institution_patterns(institution_name, top_n=3):
    if institution_name not in inst_sim_df.index:
        return f"Institution '{institution_name}' not found in dataset."
    
    # Step 1: Get top similar institutions
    similar_institutions = inst_sim_df[institution_name].sort_values(ascending=False).index[1:top_n+1]
    
    # Step 2: For each similar institution, get their top-performing opportunity categories
    recommendations = []
    for inst in similar_institutions:
        top_categories = pivot_df.loc[inst].sort_values(ascending=False).head(2).index.tolist()
        avg_success = pivot_df.loc[inst, top_categories].mean()
        recommendations.append({
            'Reference Institution': institution_name,
            'Similar Institution': inst,
            'Recommended Categories': ', '.join(top_categories),
            'Avg Completion Rate': round(avg_success, 2)
        })
    
    return pd.DataFrame(recommendations)

# Example usage
sample_institution = data['institution_name'].iloc[0]
print(f"\nðŸ‘¥ Collaborative Recommendations for '{sample_institution}':")
collab_results = recommend_institution_patterns(sample_institution)
print(collab_results)



ðŸ‘¥ Collaborative Recommendations for 'Zhuhai No.1 High School International Department':
                              Reference Institution  \
0  Zhuhai No.1 High School International Department   
1  Zhuhai No.1 High School International Department   
2  Zhuhai No.1 High School International Department   

              Similar Institution   Recommended Categories  \
0                    NIT-Agartala  Internship, Competition   
1  Nuhu Bamalli Polytechnic Zaria  Internship, Competition   
2    North maharashtra university  Internship, Competition   

   Avg Completion Rate  
0                  0.5  
1                  0.5  
2                  0.5  


In [13]:
# -------------------------------
# CONTENT-BASED FILTERING
# -------------------------------

# Make a copy and keep the original category for reference
content_df = data.copy()

# Save category names separately before encoding
original_categories = content_df[['opportunity_name', 'opportunity_category', 'opportunity_duration', 'institution_name']]

# Encode the opportunity category
content_encoded = pd.get_dummies(content_df, columns=['opportunity_category'], drop_first=True)

# Select features for similarity
content_features = ['opportunity_duration', 'apply_lag', 'start_lag'] + \
                   [col for col in content_encoded.columns if col.startswith('opportunity_category_')]

# Normalize
scaler = StandardScaler()
scaled_features = scaler.fit_transform(content_encoded[content_features])

# Compute cosine similarity
similarity_matrix = cosine_similarity(scaled_features)
similarity_df = pd.DataFrame(similarity_matrix, index=content_encoded.index, columns=content_encoded.index)

# Function to recommend similar opportunities
def recommend_opportunities(index, top_n=3):
    similar_indices = similarity_df[index].sort_values(ascending=False).index[1:top_n+1]
    # pull from original (non-encoded) columns for readability
    return data.loc[similar_indices, ['opportunity_name', 'opportunity_category', 'opportunity_duration', 'institution_name']]

# Example usage
print("\nðŸŽ¯ Content-Based Recommendations for the first opportunity:")
print(recommend_opportunities(0))


ValueError: Input contains NaN.