In [1]:
%load_ext autoreload
%autoreload 2
import pipeline
import pandas as pd

In [2]:
# Load the raw projects CSV through the project's own loader.
# NOTE(review): absolute, machine-specific path -- move to a configurable data directory.
df = pipeline.read_load('/Users/erhla/Downloads/projects_2012_2013.csv')

# Parse both date columns so that they can be subtracted from each other.
df['date_posted'] = pd.to_datetime(df['date_posted'])
df['datefullyfunded'] = pd.to_datetime(df['datefullyfunded'])
df['days_to_fund'] = df['datefullyfunded'] - df['date_posted']

# Outcome column: 1 when the project was fully funded within 60 days of posting, else 0.
# A plain integer cast of the boolean mask is used instead of pd.get_dummies(..., drop_first=True):
# it always yields exactly one 0/1 column, regardless of pandas version or of whether
# both outcomes are present in the data. Rows with a missing (NaT) duration compare False -> 0.
df['funded_within_60_days'] = (df['days_to_fund'] <= pd.Timedelta('60 days')).astype(int)

# Constants, hardcoded factors, and pre-selected features.
# Candidate training-window start dates (month/day/year strings).
train_start_dates = ['06/01/2012', '12/01/2012', '06/01/2013']
# Column the temporal train/test split is keyed on.
date_col = 'date_posted'
# Passed to pipeline.time_split below; presumably a 26-week test window
# and a 60-day gap between train and test -- TODO confirm against pipeline.time_split.
test_length = '26 w'
test_train_offset = '60 d'
# Columns handed to pipeline.preprocess: the first list is filled, the second has NA rows dropped.
cols_to_fill = ['students_reached']
cols_to_drop_nas = ['primary_focus_area', 'resource_type', 'grade_level']
# Name of the outcome column created above.
y_col = 'funded_within_60_days'

# Column name -> feature treatment requested from pipeline.generate_features
# ('discretized' or 'dummy').
feature_dict = {'students_reached': 'discretized',
                'total_price_including_optional_support': 'discretized',
                'school_charter': 'dummy',
                'school_magnet': 'dummy',
                'eligible_double_your_impact_match': 'dummy',
                'teacher_prefix': 'dummy',
                'poverty_level': 'dummy',
                'grade_level': 'dummy',
                'primary_focus_area': 'dummy',
                'resource_type': 'dummy'
               }

In [3]:
# Build the master frame: clean the raw data first, then derive the model features.
# The third argument to generate_features (10) is kept as in the original call;
# presumably the number of bins for 'discretized' columns -- TODO confirm.
preprocessed = pipeline.preprocess(df, cols_to_fill, cols_to_drop_nas)
df, feature_ls = pipeline.generate_features(preprocessed, feature_dict, 10)

53 nas filled for students_reached
students_reached discretized
total_price_including_optional_support discretized
school_charter has values:  ['f' 't']
dummy created for school_charter
school_magnet has values:  ['f' 't']
dummy created for school_magnet
eligible_double_your_impact_match has values:  ['f' 't']
dummy created for eligible_double_your_impact_match
teacher_prefix has values:  ['Mrs.' 'Ms.' 'Mr.' 'Dr.']
target variable has more than two values, multiple dummies created
poverty_level has values:  ['highest poverty' 'high poverty' 'low poverty' 'moderate poverty']
target variable has more than two values, multiple dummies created
grade_level has values:  ['Grades PreK-2' 'Grades 3-5' 'Grades 9-12' 'Grades 6-8']
target variable has more than two values, multiple dummies created
primary_focus_area has values:  ['Math & Science' 'History & Civics' 'Literacy & Language'
 'Applied Learning' 'Music & The Arts' 'Health & Sports' 'Special Needs']
target variable has more than two val

In [4]:
# Cluster a copy of the master frame into 5 groups so that `df` itself is left untouched.
clustered = pipeline.create_clusters(df.copy(), feature_ls, 5)

## What type of clusters were found?

Using five clusters (an arbitrary choice), the submitted projects can be characterized as follows, based on decision tree feature-importance scores and on how each cluster's feature means compare with the dataset average:

In [5]:
# Print a profile of every cluster (most important features and features whose
# means deviate strongly from the dataset average, per the output below).
# The trailing 0.2 is presumably a feature-importance cutoff -- TODO confirm.
pipeline.explore_clusters_2(
    clustered,
    feature_ls,
    y_col,
    0.2,
)


 cluster:  0 (has 31050 values) 
 the following features have greatest feature importance:
total_price_including_optional_support
eligible_double_your_impact_match
the following features have means greater than 50% different from the dataset average
teacher_prefix_Dr.                       -100.000000
pred_label                               -100.000000
grade_level_Grades PreK-2                 -58.288262
primary_focus_area_Special Needs          -51.069561
total_price_including_optional_support     50.519612
teacher_prefix_Mr.                         64.416570
grade_level_Grades 6-8                     71.816191
grade_level_Grades 9-12                    87.003924
resource_type_Trips                       145.359098
resource_type_Visitors                    150.466403
dtype: float64

 cluster:  1 (has 24403 values) 
 the following features have greatest feature importance:
total_price_including_optional_support
eligible_double_your_impact_match
the following features have means great

We can then see, for example, that cluster 0 skews toward projects for grades 6-12 rather than PreK-2, cluster 1 skews toward Special Needs projects, and cluster 2 skews toward teachers with the prefix Mr. and toward Health & Sports projects.

In [6]:
# Demonstration: fold clusters 2 and 3 into a single cluster, then list the
# cluster labels that remain afterwards.
clusters_to_merge = [2, 3]
tmp = pipeline.merge_cluster(clustered, clusters_to_merge)
merged_labels = tmp['pred_label']
merged_labels.unique()

array([0, 5, 1, 4], dtype=int64)

In [7]:
# Demonstration: break cluster 4 into three sub-clusters (they are renumbered
# as clusters 5 through 7), then list the resulting cluster labels.
cluster_to_split = 4
n_sub_clusters = 3
tmp2 = pipeline.split_cluster(clustered, feature_ls, cluster_to_split, n_sub_clusters)
split_labels = tmp2['pred_label']
split_labels.unique()

array([0, 3, 1, 5, 6, 2, 7], dtype=int64)

In [8]:
# Demonstration: recluster from scratch with 10 clusters.
# A copy is passed (as is done when `clustered` is first built) so that, if
# create_clusters writes its labels in place, the 5-cluster labels on `clustered`
# are not overwritten and the cells above stay re-runnable.
tmp3 = pipeline.create_clusters(clustered.copy(), feature_ls, 10)
tmp3['pred_label'].unique()

array([1, 0, 4, 6, 2, 7, 5, 9, 3, 8], dtype=int64)

In [9]:
# Import the model-comparison results produced in hw5.
results = pd.read_excel('../hw5/results.xlsx')

# Row of the model with the best precision at the top 5%.
# '5_precision' is a single column (a Series), so idxmax takes no axis argument;
# axis=1 is out of range for a one-dimensional Series and is rejected by current pandas.
top_model = results.loc[results['5_precision'].idxmax()]

# Rebuild the testing and training data the top model was trained/tested on,
# starting again from the raw file.
# NOTE(review): absolute, machine-specific path -- move to a configurable data directory.
new_df = pipeline.read_load('/Users/erhla/Downloads/projects_2012_2013.csv')

# Convert columns to datetime and add the outcome column.
new_df['date_posted'] = pd.to_datetime(new_df['date_posted'])
new_df['datefullyfunded'] = pd.to_datetime(new_df['datefullyfunded'])
new_df['days_to_fund'] = new_df['datefullyfunded'] - new_df['date_posted']
# Integer cast of the boolean mask instead of pd.get_dummies(..., drop_first=True):
# always exactly one 0/1 column, independent of pandas version and of whether
# both outcomes occur. Rows with a missing (NaT) duration compare False -> 0.
new_df['funded_within_60_days'] = (new_df['days_to_fund'] <= pd.Timedelta('60 days')).astype(int)

# Temporal split anchored at the top model's training start date.
# NOTE(review): the unpacking order (test first, then train) is kept exactly as written;
# confirm it matches the return order of pipeline.time_split.
test, train = pipeline.time_split(new_df, date_col, top_model['train_start'], test_length, test_train_offset)

# Clean and featurize each side separately.
train = pipeline.preprocess(train, cols_to_fill, cols_to_drop_nas)
test = pipeline.preprocess(test, cols_to_fill, cols_to_drop_nas)
train, feature_ls = pipeline.generate_features(train, feature_dict, 10)
test, feature_ls2 = pipeline.generate_features(test, feature_dict, 10)

# Include only feature columns which appear in both testing and training.
# The order of the training feature list is preserved (rather than going through
# list(set & set)) so the column order is the same on every kernel restart.
shared_features = set(feature_ls2)
x_cols = [col for col in dict.fromkeys(feature_ls) if col in shared_features]

39 nas filled for students_reached
students_reached discretized
total_price_including_optional_support discretized
school_charter has values:  ['f' 't']
dummy created for school_charter
school_magnet has values:  ['f' 't']
dummy created for school_magnet
eligible_double_your_impact_match has values:  ['t' 'f']
dummy created for eligible_double_your_impact_match
teacher_prefix has values:  ['Mrs.' 'Ms.' 'Mr.']
target variable has more than two values, multiple dummies created
poverty_level has values:  ['highest poverty' 'high poverty' 'moderate poverty' 'low poverty']
target variable has more than two values, multiple dummies created
grade_level has values:  ['Grades 3-5' 'Grades PreK-2' 'Grades 9-12' 'Grades 6-8']
target variable has more than two values, multiple dummies created
primary_focus_area has values:  ['History & Civics' 'Literacy & Language' 'Math & Science'
 'Music & The Arts' 'Applied Learning' 'Health & Sports' 'Special Needs']
target variable has more than two values, m

In [10]:
# Get predicted scores from the top model (its type and parameters come from the hw5 results row).
y_test_predicted = pipeline.build_models(test[x_cols], test[y_col], train[x_cols], train[y_col], [top_model['type']], top_model['parameters'])

# Add predicted scores to the testing frame.
test['pred_score'] = y_test_predicted

# Keep the 5% of testing rows with the highest predicted score.
# Ranking with method='first' breaks ties by order of appearance, so exactly
# `n_top` rows are selected even when many rows share the same score; a strict
# "score > cutoff" filter would drop every row tied with the cutoff and could
# return far fewer rows (or none). The boolean mask keeps the rows in their
# original order.
n_top = int(test.shape[0] * 0.05)
score_rank = test['pred_score'].rank(method='first', ascending=False)
top_five_pred_scores = test[score_rank <= n_top].copy()


In [11]:
# Add clusters to the top-5% projects.
# feature_ls was generated from the training frame, while this frame is a subset
# of the testing frame; restrict the list to columns that actually exist here so a
# training-only dummy column cannot cause a missing-column error. When every
# feature is present the list is unchanged.
cluster_features = [col for col in feature_ls if col in top_five_pred_scores.columns]
top_five_pred_scores = pipeline.create_clusters(top_five_pred_scores, cluster_features, 5)

# Explore clusters (same call shape as for the full-data clusters).
pipeline.explore_clusters_2(top_five_pred_scores, cluster_features, y_col, 0.2)


 cluster:  0 (has 569 values) 
 the following features have greatest feature importance:
resource_type_Other
the following features have means greater than 50% different from the dataset average
pred_label                  -100.000000
resource_type_Visitors       -76.669596
poverty_level_low poverty    -68.892794
dtype: float64

 cluster:  1 (has 365 values) 
 the following features have greatest feature importance:
students_reached
teacher_prefix_Mrs.
the following features have means greater than 50% different from the dataset average
primary_focus_area_History & Civics    -79.700542
students_reached                       -77.817618
resource_type_Trips                    -70.158061
eligible_double_your_impact_match      -66.999869
primary_focus_area_Applied Learning    -53.071144
pred_label                             -52.333932
primary_focus_area_Music & The Arts    -50.121331
primary_focus_area_Health & Sports      69.726027
primary_focus_area_Special Needs       295.324598
dtype:

The above clusters were generated from the testing data, restricted to projects whose predicted scores were in the top 5% according to the model with the highest precision at the top 5% of the population.

These projects were then grouped into 5 clusters. Cluster 0, for example, is much less likely to include projects from low-poverty areas.