In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import precision_score

# Read the raw outcome labels and the project metadata
outcomes = pd.read_csv('outcomes.csv')
projects = pd.read_csv('projects.csv')

# Encode the 't'/'f' flag as 1/0 (values outside the mapping become NaN)
outcomes['fully_funded'] = outcomes['fully_funded'].map({'t': 1, 'f': 0})
# Target is the inverted flag: 1 for every row whose fully_funded is not 1
# (this includes rows where the mapping above produced NaN)
outcomes['not_fully_funded'] = outcomes['fully_funded'].ne(1).astype('int64')
projects['date_posted'] = pd.to_datetime(projects['date_posted'])

# Sanity check: class balance and a few labelled rows
print(outcomes['not_fully_funded'].value_counts())
print(outcomes[['projectid','is_exciting', 'not_fully_funded']].head())

# Attach the target to the project rows (default inner join on projectid)
label_columns = outcomes[['projectid', 'not_fully_funded']]
data = projects.merge(label_columns, on='projectid')

# TODO: turn model scores into predictions via ranking:
#   1. sort projects by predicted score, descending;
#   2. label a project 1 if its rank falls in the top 10%, otherwise 0.

# Function to train and evaluate models
def train_evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features passed to ``model.fit``.
        X_test: Held-out features passed to ``model.predict``.
        y_train: Training labels passed to ``model.fit``.
        y_test: Held-out labels compared against the predictions.

    Returns:
        The value of ``precision_score(y_test, predictions)``.
    """
    model.fit(X_train, y_train)
    predicted_labels = model.predict(X_test)
    return precision_score(y_test, predicted_labels)

not_fully_funded
0    430683
1    188643
Name: count, dtype: int64
                          projectid is_exciting  not_fully_funded
0  ffffc4f85b60efc5b52347df489d0238           f                 1
1  ffffac55ee02a49d1abc87ba6fc61135           f                 0
2  ffff97ed93720407d70a2787475932b0           f                 0
3  ffff418bb42fad24347527ad96100f81           f                 1
4  ffff2d9c769c8fb5335e949c615425eb           t                 0


In [2]:
# Compare test-set precision as categorical features are added one at a time.
# This cell does not load anything itself: it relies on `data` and
# `train_evaluate_model` from the previous cell.
features_list = ['primary_focus_subject', 'school_state', 'resource_type', 'poverty_level']

RANDOM_STATE = 42         # shared by the split and both models so runs are repeatable
TEST_SIZE = 0.2           # fraction of rows held out for evaluation
MAX_LOGREG_FEATURES = 2   # subsets up to this size use logistic regression

# The target does not depend on the feature subset, so select it once.
y = data['not_fully_funded']

precision_scores = []
for i in range(1, len(features_list) + 1):
    # Evaluate the first i features of features_list
    features = features_list[:i]
    X = pd.get_dummies(data[features], drop_first=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # NOTE(review): the model family changes with the subset size, so the
    # difference between the 2-feature and 3-feature rows mixes the effect of
    # the added feature with the switch from logistic regression to random forest.
    if i <= MAX_LOGREG_FEATURES:
        # Use Logistic Regression for small feature sets
        model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=RANDOM_STATE)
    else:
        # Use Random Forest for larger feature sets
        model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=RANDOM_STATE)

    # Evaluate model; print immediately so progress is visible during long fits
    precision = train_evaluate_model(model, X_train, X_test, y_train, y_test)
    precision_scores.append((features, precision))
    print(f"Features: {features} -> Precision: {precision}")

# Recap of all results under a header, so the summary is distinguishable from
# the progress lines above and matches the output saved with the notebook
print("\nFeature Precision Results:")
for features, precision in precision_scores:
    print(f"Features: {features} -> Precision: {precision}")

Features: ['primary_focus_subject'] -> Precision: 0.3250726118760757
Features: ['primary_focus_subject', 'school_state'] -> Precision: 0.3678955181676799
Features: ['primary_focus_subject', 'school_state', 'resource_type'] -> Precision: 0.3791433649095919
Features: ['primary_focus_subject', 'school_state', 'resource_type', 'poverty_level'] -> Precision: 0.38867553965665386

Feature Precision Results:
Features: ['primary_focus_subject'] -> Precision: 0.3250726118760757
Features: ['primary_focus_subject', 'school_state'] -> Precision: 0.3678955181676799
Features: ['primary_focus_subject', 'school_state', 'resource_type'] -> Precision: 0.3791433649095919
Features: ['primary_focus_subject', 'school_state', 'resource_type', 'poverty_level'] -> Precision: 0.38867553965665386
