In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics as mt
from sklearn.cross_validation import StratifiedShuffleSplit
from keras.preprocessing import sequence, text
from keras.layers import Input, Embedding

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist
# from textblob import TextBlob

import datetime as dt
import pandas as pd
import numpy as np
import warnings
import string

stop_words = []
stop_words = list(set(stopwords.words('english')))
warnings.filterwarnings('ignore')
punctuation = string.punctuation

Using TensorFlow backend.


In [2]:
id_column = "id"
missing_token = " UNK "

df = pd.read_csv("train.csv", parse_dates=["project_submitted_datetime"])
# test = pd.read_csv("test.csv", parse_dates=["project_submitted_datetime"])
# rc = pd.read_csv("resources.csv").fillna(missing_token)

#df = pd.concat([train, test], axis=0) 

In [3]:
df.head()

Unnamed: 0,id,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved
0,p036502,484aaf11257089a66cfedc9461c6bd0a,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,Literacy & Language,Literacy,Super Sight Word Centers,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,,,My students need 6 Ipod Nano's to create and d...,26,1
1,p039565,df72a3ba8089423fa8a94be88060f6ed,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,"Music & The Arts, Health & Sports","Performing Arts, Team Sports",Keep Calm and Dance On,Our elementary school is a culturally rich sch...,We strive to provide our diverse population of...,,,My students need matching shirts to wear for d...,1,0
2,p233823,a9b876a9252e08a55e3d894150f75ba3,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,"Math & Science, Literacy & Language","Applied Sciences, Literature & Writing",Lets 3Doodle to Learn,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,We are looking to add some 3Doodler to our cla...,,,My students need the 3doodler. We are an SEM s...,5,1
3,p185307,525fdbb6ec7f538a48beebaa0a51b24f,Mr.,NC,2016-08-12 15:42:11,Grades 3-5,Health & Sports,Health & Wellness,"\""Kid Inspired\"" Equipment to Increase Activit...",My students are the greatest students but are ...,"The student's project which is totally \""kid-i...",,,My students need balls and other activity equi...,16,0
4,p013780,a63b5547a7239eae4c1872670848e61a,Mr.,CA,2016-08-06 09:09:11,Grades 6-8,Health & Sports,Health & Wellness,We need clean water for our culinary arts class!,My students are athletes and students who are ...,For some reason in our kitchen the water comes...,,,My students need a water filtration system for...,42,1


In [4]:
# Generate the Category variables 

df["lsLangLit"]=df["project_subject_categories"].str.contains("Literacy & Language")
df["lsMathSci"]=df["project_subject_categories"].str.contains("Math & Science")
df["lsAppLearn"]=df["project_subject_categories"].str.contains("Applied Learning")
df["lsHealth"]=df["project_subject_categories"].str.contains("Health & Sports")
df["lsSpNeeds"]=df["project_subject_categories"].str.contains("Special Needs")
df["lsHistCiv"]=df["project_subject_categories"].str.contains("History & Civics")
df["lsMusArts"]=df["project_subject_categories"].str.contains("Music & The Arts")
df["lsWarm"]=df["project_subject_categories"].str.contains("Warmth, Care & Hunger")


In [5]:
# Count the number of categories for each record
df["CatNum"]=df[['lsLangLit','lsMathSci','lsAppLearn','lsHealth','lsSpNeeds','lsHistCiv','lsMusArts','lsWarm']].sum(1)


In [6]:
temp2 = df.groupby('CatNum')['project_is_approved'].agg(['sum','count'])
temp2['approval_rate']=(temp2['sum']*100)/temp2['count']
temp2.columns=['# of projects approved','# of total project','Approval Rate']
temp2

Unnamed: 0_level_0,# of projects approved,# of total project,Approval Rate
CatNum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,94627,111921,84.548029
2,59719,70159,85.119514


The approval rate is approximately the same no matter how many categories the project selected. To simplify the analysis, we reduced our project data set to only those projects that indicated one category. 

In [7]:
df2 = df #preserving old df in case we need to revert
df = df[df['CatNum']==1]

In [8]:
# fillup empty values with missing token 
df['project_essay_3'] = df['project_essay_3'].fillna(missing_token)
df['project_essay_4'] = df['project_essay_4'].fillna(missing_token)

# extract length of each essay and title
df["essay1_len"] = df['project_essay_1'].apply(len)
df["essay2_len"] = df['project_essay_2'].apply(len)
df["essay3_len"] = df['project_essay_3'].apply(len)
df["essay4_len"] = df['project_essay_4'].apply(len)
df["title_len"] = df['project_title'].apply(len)

In [9]:
df[['essay1_len', 'essay2_len', 'essay3_len', 'essay4_len', 'title_len']].head(10)

Unnamed: 0,essay1_len,essay2_len,essay3_len,essay4_len,title_len
0,967,805,5,5,24
3,1201,1209,5,5,72
4,451,556,5,5,48
6,789,931,5,5,21
8,573,774,5,5,37
9,624,710,5,5,27
13,506,576,5,5,52
14,625,669,5,5,16
17,1146,1269,5,5,41
20,653,646,5,5,37


In [10]:
# combine the project essays to create a complete essay text
df['text'] = df.apply(lambda row: ' '.join([str(row['project_essay_1']), 
                                            str(row['project_essay_2']), 
                                            str(row['project_essay_3']), 
                                            str(row['project_essay_4'])]), axis=1)

# extract features from text
df['char_count'] = df['text'].apply(len)
df['word_count'] = df['text'].apply(lambda x: len(x.split()))
df['word_density'] = df['char_count'] / (df['word_count']+1)
df['punctuation_count'] = df['text'].apply(lambda x: len("".join(_ for _ in x if _ in punctuation))) 
df['title_word_count'] = df['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
df['upper_case_word_count'] = df['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))
df['stopword_count'] = df['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.lower() in stop_words]))

In [11]:
df[['char_count', 'word_count', 'word_density', 'punctuation_count', 'title_word_count', 'upper_case_word_count', 'stopword_count']].head(10)

Unnamed: 0,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,stopword_count
0,1785,314,5.666667,40,21,7,151
3,2423,388,6.228792,77,31,6,188
4,1020,187,5.425532,15,13,2,98
6,1733,282,6.123675,43,23,5,127
8,1360,239,5.666667,31,11,6,111
9,1347,240,5.589212,36,16,10,114
13,1095,193,5.64433,38,14,3,90
14,1307,219,5.940909,27,9,10,100
17,2428,400,6.054863,59,31,3,181
20,1312,205,6.368932,41,9,4,93


In [12]:
temp=df.groupby('project_title')['project_is_approved'].agg(['sum','count'])
temp['approval_rate']=(temp['sum']*100)/temp['count']
temp.columns=['# of projects approved','# of total projects','Approval rate']
temp=temp.sort_values(by='# of total projects',ascending=False)
temp=temp.iloc[0:20]
temp

Unnamed: 0_level_0,# of projects approved,# of total projects,Approval rate
project_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Flexible Seating,145,183,79.234973
Wiggle While We Work,84,93,90.322581
Can You Hear Me Now?,70,76,92.105263
Wiggle While You Work,70,75,93.333333
Wiggle While You Work!,63,66,95.454545
Wiggle While We Work!,58,62,93.548387
"We Like to Move It, Move It!",48,61,78.688525
Listen Up!,47,51,92.156863
Let's Get Moving!,40,50,80.0
Classroom Library,45,49,91.836735


In [13]:
# Cleaning up variables to get features vs. target(s)

if 'project_is_approved'in df:
    y1 = df['project_is_approved'] # Creates the approval target 
    del df['project_is_approved']
    
    
if 'project_subject_categories'in df:
    y2 = df['project_subject_categories'] # Creates the approval target if we're using the full listing and not one-hot encoded 
    # del df['project_subject_categories'] # Remove the categories from the training set
    del df['project_subject_subcategories'] # Remove the subcategories, since they are dependent on the categories

In [14]:
# Splitting here to have train vs. test data 

#train=df.sample(frac=0.5,random_state=200) #Used a small sample to allow processing (encountered memory issues)

In [18]:
# Creating the Sparse matrix to possibly use KNN?

# Started with code snippet from https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/

# create the transform
vectorizer = TfidfVectorizer()
# tokenize and build vocab and transform text documents
new_text = vectorizer.fit_transform(df['text'])


In [19]:
print(df.shape)
print(new_text.shape)
print(y2.shape)

(111921, 36)
(111921, 59888)
(111921,)


In [27]:
from sklearn.preprocessing import StandardScaler
# create variables we are more familiar with
X = new_text
y = y2
yhat = np.zeros(y.shape) # we will fill this with predictions

# create cross validation iterator
cv = StratifiedKFold(n_splits=10)

# get a handle to the classifier object, which defines the type
clf = KNeighborsClassifier(n_neighbors=3)

# now iterate through and get predictions, saved to the correct row in yhat
# NOTE: you can parallelize this using the cross_val_predict method
for train, test in cv.split(X,y):
    clf.fit(X[train].astype(str),y[train].astype(str))
    yhat[test] = clf.predict(X[test])

total_accuracy = mt.accuracy_score(y, yhat)
print ('KNN accuracy', total_accuracy)

MemoryError: 

Ran into memory error trying to run this last bit. 