In [3]:
import pandas as pd

# Load outcomes.csv
outcomes = pd.read_csv('outcomes.csv')

# Convert 't' (True) → 1, 'f' (False) → 0
outcomes['fully_funded'] = outcomes['fully_funded'].map({'t': 1, 'f': 0})
outcomes['not_fully_funded'] = outcomes['fully_funded'].apply(lambda x: 0 if x == 1 else 1)

# Confirm the conversion
print(outcomes['not_fully_funded'].value_counts())
print(outcomes[['projectid','is_exciting', 'not_fully_funded']].head())

not_fully_funded
0    430683
1    188643
Name: count, dtype: int64
                          projectid is_exciting  not_fully_funded
0  ffffc4f85b60efc5b52347df489d0238           f                 1
1  ffffac55ee02a49d1abc87ba6fc61135           f                 0
2  ffff97ed93720407d70a2787475932b0           f                 0
3  ffff418bb42fad24347527ad96100f81           f                 1
4  ffff2d9c769c8fb5335e949c615425eb           t                 0


In [5]:
projects = pd.read_csv('projects.csv')
data = pd.merge(projects, outcomes[['projectid', 'not_fully_funded']], on='projectid')


In [6]:
features = ['primary_focus_subject', 'school_state', 'resource_type', 'poverty_level']  # example
X = data[features]
y = data['not_fully_funded']


In [7]:
X = pd.get_dummies(X, drop_first=True)


In [11]:
from sklearn.model_selection import train_test_split

# Convert and filter by date
data['date_posted'] = pd.to_datetime(data['date_posted'])
data = data[data['date_posted'] >= '2010-01-01']

# Sort chronologically
data = data.sort_values('date_posted')

# Temporal 80/20 split
cutoff = int(len(data) * 0.8)
train = data.iloc[:cutoff]
test = data.iloc[cutoff:]

# Extract features and target
X_train = train[features]
y_train = train['not_fully_funded']
X_test = test[features]
y_test = test['not_fully_funded']



In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

# Step 1: Instantiate the model
model = RandomForestClassifier(class_weight='balanced', random_state=42)

# Step 2: Define metrics
scoring = ['accuracy', 'precision']

# Step 3: Run 5-fold cross-validation
cv_results = cross_validate(model, X, y, cv=5, scoring=scoring)

# Step 4: Output results
print("Cross-validation accuracy scores:", cv_results['test_accuracy'])
print("Average accuracy:", cv_results['test_accuracy'].mean())

print("Cross-validation precision scores:", cv_results['test_precision'])
print("Average precision:", cv_results['test_precision'].mean())


Cross-validation accuracy scores: [0.55082912 0.5678521  0.57615953 0.59639123 0.60850119]
Average accuracy: 0.5799466341111341
Cross-validation precision scores: [0.35634987 0.36165108 0.37281948 0.39147671 0.4013563 ]
Average precision: 0.37673068863406584


In [14]:
# 1. Predict probabilities
y_probs = model.predict_proba(X_test)[:, 1]  # probability of class 1

# 2. Apply a custom threshold (e.g., 0.7 instead of 0.5)
y_pred_custom = (y_probs >= 0.7).astype(int)

# 3. Evaluate
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_custom))


NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [22]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_score, confusion_matrix

# 1. Handle missing values (if any)
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# 2. One-hot encode categorical features, if not already done
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

# 3. Align test set to training set (same columns)
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# 4. Check for any NaNs or bad types again
assert X_train.isnull().sum().sum() == 0, "NaNs in X_train"
assert X_test.isnull().sum().sum() == 0, "NaNs in X_test"

# 5. Instantiate logistic regression model
model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)

# 6. Train the model
model.fit(X_train, y_train)

# 7. Predict on test set
y_pred = model.predict(X_test)

# 8. Evaluate performance
print("Classification Report:\n")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("Precision Score:", precision_score(y_test, y_pred))



Classification Report:

              precision    recall  f1-score   support

           0       0.79      0.43      0.56     62713
           1       0.34      0.71      0.46     25207

    accuracy                           0.51     87920
   macro avg       0.56      0.57      0.51     87920
weighted avg       0.66      0.51      0.53     87920

Confusion Matrix:
[[27267 35446]
 [ 7295 17912]]
Precision Score: 0.3356947411822032


In [None]:
import sys
print(sys.executable)


In [None]:
!{sys.executable} -m pip install xgboost


In [None]:
import xgboost
print(xgboost.__version__)


In [24]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

model = XGBClassifier(
    scale_pos_weight=(y == 0).sum() / (y == 1).sum(),  # class balancing
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.79      0.46      0.58     62713
           1       0.34      0.69      0.46     25207

    accuracy                           0.53     87920
   macro avg       0.56      0.58      0.52     87920
weighted avg       0.66      0.53      0.55     87920



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None]
}

grid = GridSearchCV(
    estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grid=params,
    scoring='precision',  # prioritize precision
    cv=5
)

grid.fit(X_train, y_train)

# Use best model from search
model = grid.best_estimator_
print("Best precision score from GridSearchCV:", grid.best_score_)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define search space
param_grid = {
    'n_estimators': [100, 200, 300],      # number of trees
    'max_depth': [5, 10, 20, None],       # tree depth
    'min_samples_split': [2, 5, 10],      # min split
    'min_samples_leaf': [1, 2, 4],        # min leaf
    'max_features': ['sqrt', 'log2'],     # feature selection
}

# Instantiate model
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# Grid search, scoring on precision
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='precision',
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Fit
grid_search.fit(X_train, y_train)

# Use best model
best_rf = grid_search.best_estimator_

# Predict and evaluate
from sklearn.metrics import classification_report
y_pred = best_rf.predict(X_test)
print("Best precision score from CV:", grid_search.best_score_)
print(classification_report(y_test, y_pred))
