In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score

# Step 1: Load and preprocess data
# Read the three raw tables from CSV files in the working directory.
outcomes, projects, donations = (
    pd.read_csv(csv_name)
    for csv_name in ('outcomes.csv', 'projects.csv', 'donations.csv')
)

# Parse 'donation_timestamp' into datetimes. With errors='coerce', any value
# that cannot be parsed becomes NaT instead of raising an exception.
parsed_timestamps = pd.to_datetime(donations['donation_timestamp'], errors='coerce')
donations['donation_timestamp'] = parsed_timestamps

# Discard the donations whose timestamp ended up as NaT.
donations = donations.dropna(subset=['donation_timestamp'])

# Encode 'fully_funded': 't' -> 1, 'f' -> 0. Any other value maps to NaN.
funded_codes = {'t': 1, 'f': 0}
outcomes['fully_funded'] = outcomes['fully_funded'].map(funded_codes)

# Inverse target: 0 when the project is fully funded, 1 otherwise.
# Rows whose label did not map (NaN) are therefore also labelled 1.
outcomes['not_fully_funded'] = (outcomes['fully_funded'] != 1).astype('int64')

# Sanity-check the encoding: class counts and a peek at the first rows.
print(outcomes['not_fully_funded'].value_counts())
print(outcomes[['projectid','is_exciting', 'not_fully_funded']].head())


# Join projects to their outcomes and donations. The projects table is
# processed in fixed-size row slices and the partial joins are stacked.
chunk_size = 5000  # Example chunk size (rows of `projects` per slice)

# Only these donation columns are carried into the joined table.
donation_columns = donations[['projectid', 'donation_timestamp', 'donation_total']]

# Both joins are inner joins on 'projectid', so a project without an outcome
# row or without any donation row is dropped.
result_chunks = [
    projects.iloc[offset:offset + chunk_size]
    .merge(outcomes, on='projectid', how='inner')
    .merge(donation_columns, on='projectid', how='inner')
    for offset in range(0, len(projects), chunk_size)
]

# Stack the per-slice results into one DataFrame with a fresh 0..n-1 index.
data = pd.concat(result_chunks, ignore_index=True)

# Filter by date: keep only rows whose donation was made on or after 2010-01-01.
# The explicit .copy() makes `data` an independent frame, so the column
# assignments below modify a real DataFrame rather than a slice of the merged
# table (this is what triggers pandas' SettingWithCopyWarning otherwise).
data = data[data['donation_timestamp'] >= '2010-01-01'].copy()

# Add interaction features.
# String interactions concatenate two columns with an underscore separator.
data['subject_resource_interaction'] = data['primary_focus_subject'] + '_' + data['resource_type']
data['state_poverty_interaction'] = data['school_state'] + '_' + data['poverty_level']
data['state_resource_interaction'] = data['school_state'] + '_' + data['resource_type']

# Donation amount relative to the number of students reached; the +1 in the
# denominator avoids division by zero when students_reached is 0.
data['donation_per_student'] = data['donation_total'] / (data['students_reached'] + 1)

In [None]:
# Step 2: Minimal feature matrix
# The order of this list is kept as-is because it drives the column order of
# the encoded matrix built below.
features = [
    # Project attributes
    'primary_focus_subject', 'primary_focus_area', 'school_state',
    'resource_type', 'poverty_level', 'students_reached',
    # Donation amount joined in from the donations table.
    # NOTE(review): this is only known after a donation happened -- confirm it
    # is legitimately available at prediction time (possible target leakage).
    'donation_total',
    # Engineered features
    'subject_resource_interaction', 'state_poverty_interaction',
    'donation_per_student', 'state_resource_interaction',
]

# One-hot encode the selected columns as uint8 indicators; drop_first=True
# omits the first level of every encoded column.
X_sample = pd.get_dummies(
    data.loc[:, features],
    drop_first=True,
    dtype='uint8',
)

# Target: 1 = not fully funded, 0 = fully funded.
y_sample = data.loc[:, 'not_fully_funded']

# Step 3: Train/test split
# Hold out 20% of the rows for evaluation, stratified on the target so both
# parts keep the same class balance; the seed is fixed for reproducibility.
# NOTE(review): `data` has one row per (project, donation) pair, so rows of the
# same project may fall on both sides of the split -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_sample,
    y_sample,
    test_size=0.2,
    stratify=y_sample,
    random_state=42,
)

# Step 4: Train model and predict probabilities
# n_jobs=-1 trains the trees in parallel on all available cores. (The previous
# keyword `can_jobs` is not a RandomForestClassifier parameter and made the
# constructor raise a TypeError.)
rf = RandomForestClassifier(n_estimators=100, max_depth=15, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of class 1 (not fully funded).
y_proba = rf.predict_proba(X_test)[:, 1]

In [None]:
# Step 5: Rank probabilities and calculate precision
# Build one row per test sample; the frame keeps X_test's index labels so it
# can be re-aligned with y_test later.
results_df = pd.DataFrame({
    'projectid': data.loc[X_test.index, 'projectid'],  # Project ID of each test row
    'score': y_proba  # Predicted probability of not getting funded
})
# Sort by score in descending order
results_df = results_df.sort_values(by='score', ascending=False)

# Assign rank based on score order (1 = highest score; ties broken by position)
results_df['rank'] = results_df['score'].rank(method='first', ascending=False)

# Label top 10% as needing review
threshold = results_df['score'].quantile(0.90)  # Top 10% based on actual probability distribution
results_df['prediction'] = (results_df['score'] >= threshold).astype(int)

# Compute precision score.
# precision_score compares its arguments position by position and ignores the
# pandas index. results_df was re-ordered by the sort above, so y_test must be
# put into the same row order first; otherwise labels and predictions of
# different rows would be compared.
y_true_sorted = y_test.loc[results_df.index]
precision = precision_score(y_true_sorted, results_df['prediction'])

# Output results
print(f"Precision for top 10%: {precision}")
print(results_df.head(15))

In [None]:
# Optional feature-importance plots for the fitted forest `rf`. The whole cell
# is commented out (disabled); enabling it requires matplotlib.
#import matplotlib.pyplot as plt

#importance = rf.feature_importances_
#plt.figure(figsize=(12, 10))  # Increase figure size for clarity
#plt.barh(X_train.columns, importance)  # Plot feature importance
#plt.xticks(rotation=45, fontsize=10)  # Rotate text to prevent overlap
#plt.ylabel("Features")  # Label for Y-axis
#plt.xlabel("Importance Score")  # Label for X-axis
#plt.title("Feature Importance")  # Add a meaningful title
#plt.show()


#top_features = pd.Series(importance, index=X_train.columns).nlargest(20)  # Show top 20 features
#top_features.plot(kind='barh', figsize=(12, 8))
#plt.title("Top 20 Most Important Features")
#plt.show()