In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing import sequence, text
from keras.layers import Input, Embedding

from nltk import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob

import datetime as dt
import pandas as pd
import numpy as np
import warnings
import string

# English stop words, de-duplicated and kept as a list. Sorting makes the order
# identical on every run instead of depending on set iteration order.
stop_words = sorted(set(stopwords.words('english')))
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')
# ASCII punctuation characters, used later when counting punctuation in the essays
punctuation = string.punctuation

Using TensorFlow backend.


In [12]:
id_column = "id"
missing_token = " UNK "  # placeholder written wherever a value is absent

# Read both splits with identical settings so the submission timestamp is parsed
# as a datetime in each of them.
train, test = (
    pd.read_csv(csv_path, parse_dates=["project_submitted_datetime"])
    for csv_path in ("train.csv", "test.csv")
)
# Resource table, with every missing cell replaced by the placeholder token
rc = pd.read_csv("resources.csv").fillna(missing_token)

# Stack the train rows on top of the test rows; the original row labels are kept,
# so labels repeat between the two parts.
df = pd.concat([train, test], axis=0)

In [13]:
# Preview the first five rows of the combined frame
df.head(n=5)

Unnamed: 0,id,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_grade_category,project_is_approved,project_resource_summary,project_subject_categories,project_subject_subcategories,project_submitted_datetime,project_title,school_state,teacher_id,teacher_number_of_previously_posted_projects,teacher_prefix
0,p036502,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,,,Grades PreK-2,1.0,My students need 6 Ipod Nano's to create and d...,Literacy & Language,Literacy,2016-11-18 14:45:59,Super Sight Word Centers,NV,484aaf11257089a66cfedc9461c6bd0a,26,Ms.
1,p039565,Our elementary school is a culturally rich sch...,We strive to provide our diverse population of...,,,Grades 3-5,0.0,My students need matching shirts to wear for d...,"Music & The Arts, Health & Sports","Performing Arts, Team Sports",2017-04-26 15:57:28,Keep Calm and Dance On,GA,df72a3ba8089423fa8a94be88060f6ed,1,Mrs.
2,p233823,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,We are looking to add some 3Doodler to our cla...,,,Grades 3-5,1.0,My students need the 3doodler. We are an SEM s...,"Math & Science, Literacy & Language","Applied Sciences, Literature & Writing",2017-01-01 22:57:44,Lets 3Doodle to Learn,UT,a9b876a9252e08a55e3d894150f75ba3,5,Ms.
3,p185307,My students are the greatest students but are ...,"The student's project which is totally \""kid-i...",,,Grades 3-5,0.0,My students need balls and other activity equi...,Health & Sports,Health & Wellness,2016-08-12 15:42:11,"\""Kid Inspired\"" Equipment to Increase Activit...",NC,525fdbb6ec7f538a48beebaa0a51b24f,16,Mr.
4,p013780,My students are athletes and students who are ...,For some reason in our kitchen the water comes...,,,Grades 6-8,1.0,My students need a water filtration system for...,Health & Sports,Health & Wellness,2016-08-06 09:09:11,We need clean water for our culinary arts class!,CA,a63b5547a7239eae4c1872670848e61a,42,Mr.


In [16]:
# Essays 3 and 4 can be empty: put the placeholder token in place of missing values
for essay_column in ('project_essay_3', 'project_essay_4'):
    df[essay_column] = df[essay_column].fillna(missing_token)

# Character length of each essay and of the title, as (new column, source column) pairs
length_feature_sources = (
    ("essay1_len", 'project_essay_1'),
    ("essay2_len", 'project_essay_2'),
    ("essay3_len", 'project_essay_3'),
    ("essay4_len", 'project_essay_4'),
    ("title_len", 'project_title'),
)
for feature_column, source_column in length_feature_sources:
    df[feature_column] = df[source_column].apply(len)

In [17]:
# Show the length features for the first ten rows
length_columns = ['essay1_len', 'essay2_len', 'essay3_len', 'essay4_len', 'title_len']
df[length_columns].head(10)

Unnamed: 0,essay1_len,essay2_len,essay3_len,essay4_len,title_len
0,967,805,5,5,24
1,587,639,5,5,22
2,761,546,5,5,21
3,1201,1209,5,5,72
4,451,556,5,5,48
5,492,737,5,5,37
6,789,931,5,5,21
7,458,629,5,5,36
8,573,774,5,5,37
9,624,710,5,5,27


In [22]:
# combine the project essays to create a complete essay text
essay_columns = ['project_essay_1', 'project_essay_2', 'project_essay_3', 'project_essay_4']
# Walk the four columns in lockstep instead of DataFrame.apply(axis=1), which builds
# a Series for every row. str() is still applied to each cell, so a non-string value
# (e.g. a missing essay) is rendered exactly as before.
df['text'] = [' '.join(map(str, essays))
              for essays in zip(*(df[column] for column in essay_columns))]

# `stop_words` is a list and `punctuation` a string, so a direct `in` test rescans
# them for every word / character. Build constant-time lookups once instead.
stop_word_lookup = frozenset(stop_words)
punctuation_lookup = frozenset(punctuation)

# extract features from text
df['char_count'] = df['text'].str.len()
df['word_count'] = df['text'].apply(lambda x: len(x.split()))
# +1 in the denominator keeps the ratio finite when a text has no words
df['word_density'] = df['char_count'] / (df['word_count']+1)
df['punctuation_count'] = df['text'].apply(lambda x: sum(ch in punctuation_lookup for ch in x))
# number of title-cased words in the essay text (not words of the project title)
df['title_word_count'] = df['text'].apply(lambda x: sum(wrd.istitle() for wrd in x.split()))
df['upper_case_word_count'] = df['text'].apply(lambda x: sum(wrd.isupper() for wrd in x.split()))
df['stopword_count'] = df['text'].apply(lambda x: sum(wrd.lower() in stop_word_lookup for wrd in x.split()))

In [24]:
# Show the text-derived features for the first ten rows
text_feature_columns = [
    'char_count',
    'word_count',
    'word_density',
    'punctuation_count',
    'title_word_count',
    'upper_case_word_count',
    'stopword_count',
]
df[text_feature_columns].head(10)

Unnamed: 0,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,stopword_count
0,1785,314,5.666667,40,21,7,151
1,1239,192,6.419689,38,15,5,79
2,1320,236,5.56962,26,26,6,103
3,2423,388,6.228792,77,31,6,188
4,1020,187,5.425532,15,13,2,98
5,1242,194,6.369231,54,21,5,88
6,1733,282,6.123675,43,23,5,127
7,1100,184,5.945946,45,16,2,75
8,1360,239,5.666667,31,11,6,111
9,1347,240,5.589212,36,16,10,114


In [26]:
# Approval statistics per project title, limited to the 20 most frequent titles.
# `count` skips missing values, so rows without an approval flag add to neither column.
temp = (
    df.groupby('project_title')['project_is_approved']
    .agg(['sum', 'count'])
    # share of approved projects, expressed as a percentage
    .assign(approval_rate=lambda stats: (stats['sum'] * 100) / stats['count'])
    .rename(columns={
        'sum': '# of projects approved',
        'count': '# of total projects',
        'approval_rate': 'Approval rate',
    })
    .sort_values(by='# of total projects', ascending=False)
    .iloc[0:20]
)
temp

Unnamed: 0_level_0,# of projects approved,# of total projects,Approval rate
project_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Flexible Seating,298.0,377,79.045093
Wiggle While You Work,137.0,150,91.333333
Wiggle While We Work,136.0,149,91.275168
Can You Hear Me Now?,135.0,144,93.75
Wiggle While You Work!,122.0,134,91.044776
Wiggle While We Work!,116.0,123,94.308943
"We Like to Move It, Move It!",85.0,100,85.0
Listen Up!,77.0,83,92.771084
Technology in the Classroom,69.0,79,87.341772
Full STEAM Ahead!,57.0,74,77.027027
