In [7]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score

# Load the outcome labels (outcomes.csv) and the project attributes (projects.csv).
outcomes = pd.read_csv('outcomes.csv')
projects = pd.read_csv('projects.csv')

# Convert 't' (True) → 1, 'f' (False) → 0
outcomes['fully_funded'] = outcomes['fully_funded'].map({'t': 1, 'f': 0})

# Series.map turns every value that is not a key of the dict (including a
# missing value) into NaN. Stop here instead of letting such rows be silently
# labelled "not fully funded" by the complement below.
assert outcomes['fully_funded'].notna().all(), \
    "fully_funded contains values other than 't'/'f'"

# Target label: the complement of fully_funded, computed vectorized.
outcomes['not_fully_funded'] = 1 - outcomes['fully_funded']

# Parse the posting date so it can be compared as a datetime.
projects['date_posted'] = pd.to_datetime(projects['date_posted'])

# Confirm the conversion
print(outcomes['not_fully_funded'].value_counts())
print(outcomes[['projectid','is_exciting', 'not_fully_funded']].head())

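# Plan for turning model scores into predictions (score → rank → prediction):
#   1. sort projects by predicted score, descending
#   2. rank them; predict 1 if the rank falls in the top 10%, otherwise 0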

not_fully_funded
0    430683
1    188643
Name: count, dtype: int64
                          projectid is_exciting  not_fully_funded
0  ffffc4f85b60efc5b52347df489d0238           f                 1
1  ffffac55ee02a49d1abc87ba6fc61135           f                 0
2  ffff97ed93720407d70a2787475932b0           f                 0
3  ffff418bb42fad24347527ad96100f81           f                 1
4  ffff2d9c769c8fb5335e949c615425eb           t                 0


In [8]:
# Attach the target label to each project (inner join on projectid, the
# merge default), then keep only rows with date_posted on or after 2010-01-01.
data = (
    projects
    .merge(outcomes[['projectid', 'not_fully_funded']], on='projectid')
    .loc[lambda merged: merged['date_posted'] >= '2010-01-01']
)

# Feature matrix / train-test split / performance matrix 
# Numerical Feature to predict precision / Logistic Regression

In [9]:
# Feature generation: one-hot encode the chosen categorical columns
# (an example feature set); drop_first=True omits the first level of each.
features = ['primary_focus_subject', 'school_state']
X = pd.get_dummies(data[features], drop_first=True)
y = data['not_fully_funded']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the class-weighted logistic regression and train it in one expression:
# fit() returns the estimator itself, so log_reg is the fitted model.
log_reg = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Hard class predictions for the held-out rows
y_pred = log_reg.predict(X_test)

# Precision on the held-out rows
precision = precision_score(y_test, y_pred)
print("Precision Score with two features:", precision)

Precision Score with two features: 0.3380939701290213
