In [11]:

import numpy as np
import pandas as pd
import os

try:
    import cPickle as pickle
except BaseException:
    import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb

# Load Data

In [12]:
donorsJson = np.load('test_dict.npy').item()

In [13]:
donorsJson

{'id': ['p233245'],
 'project_essay_1': ["My 2nd grade students are amazing!  They are very creative and enjoy art.  Unfortunately, we don't have a place to dry the student created painting or artwork.  A drying art rack will help keep the art work in a secure place and away from the floor."],
 'project_essay_2': ['My class is made up of 12 boys and 12 girls.  They are the best second graders!  They are very smart and articulate.  They enjoy reading and sharing their stories with the rest of the class.  My class enjoys the time they get to be spontaneous and creative as they are making original art creations.'],
 'project_essay_3': ['My second grade class will really benefit from having a drying rack inside the classroom.  They will no longer have to go outside to hand their artwork on the fence.  The drying art rack will also be beneficial in keeping the art well organized and neat.'],
 'project_essay_4': ['The genorous donations to my project will make a difference in that my student

In [41]:
# Define the function: 
def processInput(donorsDictionary):
    data_path = os.path.join('',)

    # Load the dictionary that you want to test: 
    input_dict = donorsDictionary
    user_input = pd.DataFrame.from_dict(input_dict, dtype=str)
    user_input['teacher_number_of_previously_posted_projects'] = user_input['teacher_number_of_previously_posted_projects'].astype(int)

    print('shape of user input: ', user_input.shape)
    
    # load resources
    res = pd.read_csv(os.path.join(data_path, 'resources.csv'))

    # Preprocess data
    user_input['project_essay'] = user_input.apply(lambda row: ' '.join([
        str(row['project_essay_1']),
        str(row['project_essay_2']), 
        str(row['project_essay_3']), 
        str(row['project_essay_4']),
        ]), axis=1)

    # Extract features
    def extract_features(df):
        df['project_title_len'] = df['project_title'].apply(lambda x: len(str(x)))
        df['project_essay_1_len'] = df['project_essay_1'].apply(lambda x: len(str(x)))
        df['project_essay_2_len'] = df['project_essay_2'].apply(lambda x: len(str(x)))
        df['project_essay_3_len'] = df['project_essay_3'].apply(lambda x: len(str(x)))
        df['project_essay_4_len'] = df['project_essay_4'].apply(lambda x: len(str(x)))
        df['project_resource_summary_len'] = df['project_resource_summary'].apply(lambda x: len(str(x)))

        df['project_title_wc'] = df['project_title'].apply(lambda x: len(str(x).split(' ')))
        df['project_essay_1_wc'] = df['project_essay_1'].apply(lambda x: len(str(x).split(' ')))
        df['project_essay_2_wc'] = df['project_essay_2'].apply(lambda x: len(str(x).split(' ')))
        df['project_essay_3_wc'] = df['project_essay_3'].apply(lambda x: len(str(x).split(' ')))
        df['project_essay_4_wc'] = df['project_essay_4'].apply(lambda x: len(str(x).split(' ')))
        df['project_resource_summary_wc'] = df['project_resource_summary'].apply(lambda x: len(str(x).split(' ')))

    extract_features(user_input)

    user_input.drop([
        'project_essay_1', 
        'project_essay_2', 
        'project_essay_3', 
        'project_essay_4'], axis=1, inplace=True)

    df_all = user_input

    # init results
    res = pd.DataFrame(res[['id', 'quantity', 'price']].groupby('id').agg(\
        {
            'quantity': [
                'sum',
                'min', 
                'max', 
                'mean', 
                'std', 
                # lambda x: len(np.unique(x)),
            ],
            'price': [
                'count', 
                'sum', 
                'min',
                'max', 
                'mean', 
                'std', 
                lambda x: len(np.unique(x)),
            ]}
        )).reset_index()
    res.columns = ['_'.join(col) for col in res.columns]
    res.rename(columns={'id_': 'id'}, inplace=True)
    res['mean_price'] = res['price_sum']/res['quantity_sum']

    print('results column: ', res.head())
    user_input = user_input.merge(res, on='id', how='left')
    del res

    # Preprocess columns with label encoder
    print('Label Encoder...')
    cols = [
        'teacher_id', 
        'teacher_prefix', 
        'school_state', 
        'project_grade_category',
        'project_subject_categories', 
        'project_subject_subcategories'
    ]
    inverse_transform = {}
    for c in cols:
        le = LabelEncoder()
        le.fit(df_all[c].astype(str))
        user_input[c] = le.transform(user_input[c].astype(str))
        inverse_transform[c] = {le.inverse_transform(user_input[c]), user_input[c])
    
    print('Done. user input:\n', user_input)


    # Preprocess timestamp
    print('Preprocessing timestamp...')
    def process_timestamp(df):
        df['year'] = df['project_submitted_datetime'].apply(lambda x: int(x.split('-')[0]))
        df['month'] = df['project_submitted_datetime'].apply(lambda x: int(x.split('-')[1]))
        df['date'] = df['project_submitted_datetime'].apply(lambda x: int(x.split(' ')[0].split('-')[2]))
        df['day_of_week'] = pd.to_datetime(df['project_submitted_datetime']).dt.weekday
        df['hour'] = df['project_submitted_datetime'].apply(lambda x: int(x.split(' ')[-1].split(':')[0]))
        df['minute'] = df['project_submitted_datetime'].apply(lambda x: int(x.split(' ')[-1].split(':')[1]))
        df['project_submitted_datetime'] = pd.to_datetime(df['project_submitted_datetime']).values.astype(np.int64)

    process_timestamp(user_input)
    print('Done.')

    # Preprocess text
    print('Preprocessing text...')
    cols = [
        'project_title', 
        'project_essay', 
        'project_resource_summary'
    ]
    n_features = [
        400, 
        4040, 
        400,
    ]

    with open('project_title_tfidf.pk', 'rb') as f:
        project_title_tfidf = pickle.load(f)

#     tfidf = TfidfVectorizer(
#         max_features=n_features[0],
#         norm='l2',
#         )
    print('project title tfidf vocab:\n', project_title_tfidf.vocabulary_)
    
    project_title_vocab = project_title_tfidf.vocabulary_
    
    tfidf_test = np.array(project_title_tfidf.transform(user_input['project_title']).toarray(), dtype=np.float16)
    
    for i in range(n_features[0]):
            user_input[c + '_tfidf_' + str(i)] = tfidf_test[:, i]
    del project_title_tfidf, tfidf_test
    gc.collect()
    print('project_title_tfidf Done.')

    with open('project_essay_tfidf.pk', 'rb') as f:
        project_essay_tfidf = pickle.load(f)

#     tfidf = TfidfVectorizer(
#         max_features=n_features[1],
#         norm='l2',
#         )

    tfidf_test = np.array(project_essay_tfidf.transform(user_input['project_resource_summary']).toarray(), dtype=np.float16)

    for i in range(n_features[1]):
            user_input[c + '_tfidf_' + str(i)] = tfidf_test[:, i]
    del project_essay_tfidf, tfidf_test
    gc.collect()
    print('project_essay_tfidf Done.')

    with open('project_resource_summary_tfidf.pk', 'rb') as f:
        project_resource_summary_tfidf = pickle.load(f)

#     tfidf = TfidfVectorizer(
#         max_features=n_features[2],
#         norm='l2',
#         )

    tfidf_test = np.array(project_resource_summary_tfidf.transform(user_input['project_resource_summary']).toarray(), dtype=np.float16)

    for i in range(n_features[2]):
            user_input[c + '_tfidf_' + str(i)] = tfidf_test[:, i]
    del project_resource_summary_tfidf, tfidf_test
    gc.collect()
    print('project_resource_summary_tfidf Done.')

    # Prepare data
    cols_to_drop = [
        'id',
        'teacher_id',
        'project_title', 
        'project_essay',
        'project_resource_summary',
        'project_is_approved',
    ]
    
    X_test = user_input.drop(cols_to_drop, axis=1, errors='ignore')
    id_test = user_input['id'].values

    # Build the model
    cnt = 0
    p_buf = []
    n_splits = 5
    n_repeats = 1

    auc_buf = []   

    # load model to predict

    print('Load model to predict')
    imported_model = pickle.load( open( "model_v1.pkl", "rb" ) )
    print('model loaded')
    #predict
    results = imported_model.predict(X_test)
    features = X_test
    feature_names = X_test.columns.tolist()
    return results[0], features, feature_names, project_title_vocab, inverse_transform

In [42]:
# Test the function: 
results, features, feature_names, project_title_vocab, inverse_transform = processInput(donorsJson)

shape of user input:  (1, 15)
results column:          id  quantity_sum  quantity_min  quantity_max  quantity_mean  \
0  p000001             7             1             2       1.750000   
1  p000002            21             1             4       1.500000   
2  p000003             4             1             1       1.000000   
3  p000004            98             1             2       1.031579   
4  p000005             8             1             3       2.000000   

   quantity_std  price_count  price_sum  price_min  price_max  price_mean  \
0      0.500000            4     459.56      23.99     261.08  114.890000   
1      0.854850           14     515.89       8.46     134.90   36.849286   
2      0.000000            4     298.97      39.99     169.00   74.742500   
3      0.175804           95    1113.69       1.60     401.54   11.723053   
4      1.154701            4     485.99      54.08     323.75  121.497500   

    price_std  price_<lambda>  mean_price  
0  101.929679      

In [43]:
project_title_vocab

{'1st': 0,
 '21st': 1,
 '2nd': 2,
 '3d': 3,
 '3rd': 4,
 '4th': 5,
 '5th': 6,
 'about': 7,
 'access': 8,
 'action': 9,
 'active': 10,
 'activities': 11,
 'ahead': 12,
 'alive': 13,
 'all': 14,
 'alternative': 15,
 'amazing': 16,
 'an': 17,
 'and': 18,
 'apple': 19,
 'are': 20,
 'around': 21,
 'art': 22,
 'artists': 23,
 'arts': 24,
 'as': 25,
 'at': 26,
 'autism': 27,
 'away': 28,
 'awesome': 29,
 'back': 30,
 'backpacks': 31,
 'ball': 32,
 'balls': 33,
 'band': 34,
 'based': 35,
 'basic': 36,
 'be': 37,
 'become': 38,
 'best': 39,
 'better': 40,
 'beyond': 41,
 'big': 42,
 'board': 43,
 'boards': 44,
 'bodies': 45,
 'body': 46,
 'book': 47,
 'books': 48,
 'bounce': 49,
 'bouncing': 50,
 'brain': 51,
 'brains': 52,
 'bring': 53,
 'bringing': 54,
 'build': 55,
 'building': 56,
 'but': 57,
 'by': 58,
 'calm': 59,
 'camera': 60,
 'can': 61,
 'carpet': 62,
 'center': 63,
 'centers': 64,
 'century': 65,
 'chairs': 66,
 'children': 67,
 'choice': 68,
 'chrome': 69,
 'chromebook': 70,
 'chrome

In [44]:
inverse_transform

{'project_grade_category': array(['Grades PreK-2'], dtype=object),
 'project_subject_categories': array(['Music & The Arts'], dtype=object),
 'project_subject_subcategories': array(['Visual Arts'], dtype=object),
 'school_state': array(['CA'], dtype=object),
 'teacher_id': array(['5724a0c3ce11008366fff36dab4b943c'], dtype=object),
 'teacher_prefix': array(['Ms.'], dtype=object)}

In [35]:
features.to_dict()

{'project_grade_category': {0: 0},
 'project_subject_categories': {0: 0},
 'project_subject_subcategories': {0: 0},
 'project_submitted_datetime': {0: 1461764741000000000},
 'school_state': {0: 0},
 'teacher_number_of_previously_posted_projects': {0: 2},
 'teacher_prefix': {0: 0},
 'project_title_len': {0: 24},
 'project_essay_1_len': {0: 249},
 'project_essay_2_len': {0: 299},
 'project_essay_3_len': {0: 254},
 'project_essay_4_len': {0: 185},
 'project_resource_summary_len': {0: 54},
 'project_title_wc': {0: 5},
 'project_essay_1_wc': {0: 49},
 'project_essay_2_wc': {0: 59},
 'project_essay_3_wc': {0: 48},
 'project_essay_4_wc': {0: 34},
 'project_resource_summary_wc': {0: 10},
 'quantity_sum': {0: 1},
 'quantity_min': {0: 1},
 'quantity_max': {0: 1},
 'quantity_mean': {0: 1.0},
 'quantity_std': {0: nan},
 'price_count': {0: 1},
 'price_sum': {0: 149.0},
 'price_min': {0: 149.0},
 'price_max': {0: 149.0},
 'price_mean': {0: 149.0},
 'price_std': {0: nan},
 'price_<lambda>': {0: 1.0},

4078

1

In [13]:
X_test

Unnamed: 0,project_grade_category,project_subject_categories,project_subject_subcategories,project_submitted_datetime,school_state,teacher_number_of_previously_posted_projects,teacher_prefix,project_title_len,project_essay_1_len,project_essay_2_len,...,project_subject_subcategories_tfidf_4030,project_subject_subcategories_tfidf_4031,project_subject_subcategories_tfidf_4032,project_subject_subcategories_tfidf_4033,project_subject_subcategories_tfidf_4034,project_subject_subcategories_tfidf_4035,project_subject_subcategories_tfidf_4036,project_subject_subcategories_tfidf_4037,project_subject_subcategories_tfidf_4038,project_subject_subcategories_tfidf_4039
0,0,0,0,1461764741000000000,0,2,0,24,249,299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
training_csv = pd.read_csv('train.csv')

In [10]:
training_csv.head()

Unnamed: 0,id,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved
0,p036502,484aaf11257089a66cfedc9461c6bd0a,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,Literacy & Language,Literacy,Super Sight Word Centers,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,,,My students need 6 Ipod Nano's to create and d...,26,1
1,p039565,df72a3ba8089423fa8a94be88060f6ed,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,"Music & The Arts, Health & Sports","Performing Arts, Team Sports",Keep Calm and Dance On,Our elementary school is a culturally rich sch...,We strive to provide our diverse population of...,,,My students need matching shirts to wear for d...,1,0
2,p233823,a9b876a9252e08a55e3d894150f75ba3,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,"Math & Science, Literacy & Language","Applied Sciences, Literature & Writing",Lets 3Doodle to Learn,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,We are looking to add some 3Doodler to our cla...,,,My students need the 3doodler. We are an SEM s...,5,1
3,p185307,525fdbb6ec7f538a48beebaa0a51b24f,Mr.,NC,2016-08-12 15:42:11,Grades 3-5,Health & Sports,Health & Wellness,"\""Kid Inspired\"" Equipment to Increase Activit...",My students are the greatest students but are ...,"The student's project which is totally \""kid-i...",,,My students need balls and other activity equi...,16,0
4,p013780,a63b5547a7239eae4c1872670848e61a,Mr.,CA,2016-08-06 09:09:11,Grades 6-8,Health & Sports,Health & Wellness,We need clean water for our culinary arts class!,My students are athletes and students who are ...,For some reason in our kitchen the water comes...,,,My students need a water filtration system for...,42,1


In [35]:
columns_list = training_csv.columns.tolist()
columns_list

['id',
 'teacher_id',
 'teacher_prefix',
 'school_state',
 'project_submitted_datetime',
 'project_grade_category',
 'project_subject_categories',
 'project_subject_subcategories',
 'project_title',
 'project_essay_1',
 'project_essay_2',
 'project_essay_3',
 'project_essay_4',
 'project_resource_summary',
 'teacher_number_of_previously_posted_projects',
 'project_is_approved']

In [32]:
teacher_prefix = training_csv.groupby('teacher_prefix').count().index.tolist()

In [29]:
project_subject_subcategories = training_csv.groupby('project_subject_subcategories').count().index.tolist()

In [30]:
project_subject_categories = training_csv.groupby('project_subject_categories').count().index.tolist()

In [33]:
teacher_number_of_previously_posted_projects = training_csv.groupby('teacher_number_of_previously_posted_projects').count().index.tolist()
teacher_number_of_previously_posted_projects

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
