In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score

# Step 1: Load and preprocess data
outcomes = pd.read_csv('outcomes.csv')
projects = pd.read_csv('projects.csv')

# 'fully_funded' is stored as 't'/'f'; map it to 1/0 and derive the label we
# actually predict (1 = project was NOT fully funded). Any value other than
# 't'/'f' maps to NaN and would propagate into the label.
outcomes['fully_funded'] = outcomes['fully_funded'].map({'t': 1, 'f': 0})
outcomes['not_fully_funded'] = 1 - outcomes['fully_funded']

# Merge datasets (inner join on projectid) and keep projects posted from 2010 on
projects['date_posted'] = pd.to_datetime(projects['date_posted'])
data = pd.merge(projects, outcomes[['projectid', 'not_fully_funded']], on='projectid')
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the merged frame (avoids SettingWithCopyWarning).
data = data[data['date_posted'] >= pd.Timestamp('2010-01-01')].copy()

# Confirm the conversion
print(outcomes['not_fully_funded'].value_counts())
print(outcomes[['projectid','is_exciting', 'not_fully_funded']].head())

# Add new time-related features
data['year_posted'] = data['date_posted'].dt.year
data['month_posted'] = data['date_posted'].dt.month
# Measure project age against the most recent posting date in the data rather
# than the wall clock, so the feature matrix is identical on every re-run.
reference_date = data['date_posted'].max()
data['days_since_posted'] = (reference_date - data['date_posted']).dt.days

# Add interaction features: one categorical level per (state, poverty level) pair
data['state_poverty_interaction'] = data['school_state'] + '_' + data['poverty_level']

not_fully_funded
0    430683
1    188643
Name: count, dtype: int64
                          projectid is_exciting  not_fully_funded
0  ffffc4f85b60efc5b52347df489d0238           f                 1
1  ffffac55ee02a49d1abc87ba6fc61135           f                 0
2  ffff97ed93720407d70a2787475932b0           f                 0
3  ffff418bb42fad24347527ad96100f81           f                 1
4  ffff2d9c769c8fb5335e949c615425eb           t                 0


In [2]:
# Step 2: Minimal feature matrix
# Column order matters: it fixes the layout of the one-hot encoded matrix and
# therefore the fitted forest, so keep this list in its current order.
features = [
    'primary_focus_subject', 'school_state', 'resource_type', 'poverty_level',
    'year_posted', 'month_posted', 'days_since_posted',
    'state_poverty_interaction',
]
y = data['not_fully_funded']
# One-hot encode the categorical columns; drop_first removes one level per
# categorical column.
X = pd.get_dummies(data[features], drop_first=True)

# Step 3: Train/test split (random 80/20 hold-out, fixed seed)
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Step 4: Train model and predict probabilities
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)
# Second column of predict_proba is the probability of class 1
# (not fully funded).
class_probabilities = rf.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

In [3]:
# Step 5: Rank probabilities and calculate precision
scores = pd.Series(y_proba, name='score')
# Rank 1 is the highest score; method='first' breaks ties by row order so
# every row gets a distinct rank.
ranks = scores.rank(ascending=False, method='first')

# Label top 10% as 1 and calculate precision
top_10_percent_cutoff = int(0.1 * len(scores))
results_df = pd.DataFrame({
    'score': scores,
    'rank': ranks,
    'prediction': ranks.le(top_10_percent_cutoff).astype(int),
})
# y_test and the predictions are both in X_test row order.
precision = precision_score(y_test, results_df['prediction'])

# Output results
print(f"Precision for top 10%: {precision}")
print(results_df.head(15))

Precision for top 10%: 0.4628070973612375
       score     rank  prediction
0   0.000000  80825.0           0
1   0.000000  80826.0           0
2   0.270000  36610.0           0
3   0.080000  60262.0           0
4   0.029167  72212.0           0
5   0.395000  27039.0           0
6   0.310000  32932.0           0
7   0.010000  76453.0           0
8   0.022500  72914.0           0
9   0.020000  73103.0           0
10  0.070000  62035.0           0
11  0.453667  23634.0           0
12  0.760000   6918.0           1
13  0.000000  80827.0           0
14  0.350000  29756.0           0
