In [32]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [2]:

df_train = pd.read_csv('train.csv')
df_resources = pd.read_csv('resources.csv')
train_resource = pd.merge(df_train, df_resources, on="id", how='left')
t2 = train_resource.pivot_table(index = ['id', 'description'],
                                values='quantity', 
                                aggfunc=np.sum)

t2.reset_index(inplace=True)
train_resource = pd.merge(train_resource,
                          t2, 
                          on=['id', 'description'], 
                          how='right',
                          suffixes=('_Orig', '_True'))
train_resource = train_resource.drop_duplicates()
train_resource = train_resource.drop('quantity_Orig', axis=1)
train_resource = train_resource.rename(columns={'quantity_True': 'True_Quantity'})
train_resource['Total_Cost'] = train_resource.price * train_resource.True_Quantity

Total_cost = train_resource.pivot_table(values=['Total_Cost','True_Quantity'],
                                        index='id',
                                        aggfunc={'Total_Cost' : np.sum, 'True_Quantity' : np.sum})
Total_cost.reset_index(level=0, inplace = True)

df_train = df_train.merge(Total_cost, on='id', how='left')
df_train['Average_item_cost'] = df_train.Total_Cost  / df_train.True_Quantity

In [3]:
df = df_train

df["lsLangLit"]=df["project_subject_categories"].str.contains("Literacy & Language")
df["lsMathSci"]=df["project_subject_categories"].str.contains("Math & Science")
df["lsAppLearn"]=df["project_subject_categories"].str.contains("Applied Learning")
df["lsHealth"]=df["project_subject_categories"].str.contains("Health & Sports")
df["lsSpNeeds"]=df["project_subject_categories"].str.contains("Special Needs")
df["lsHistCiv"]=df["project_subject_categories"].str.contains("History & Civics")
df["lsMusArts"]=df["project_subject_categories"].str.contains("Music & The Arts")
df["lsWarm"]=df["project_subject_categories"].str.contains("Warmth, Care & Hunger")



# Cleaning up variables to get features vs. target(s)# Cleani 
df2 = df # Preserve original dataframe

if 'project_is_approved'in df:
    y1 = df['project_is_approved'] # Creates the approval target 
    del df['project_is_approved']

y2 = df[['lsLangLit','lsMathSci','lsHealth','lsSpNeeds','lsHistCiv','lsMusArts','lsWarm']]# creates a target matrix of the categories

# Remove the target category variables
if 'lsLangLit'in df:
    del df['lsLangLit']
    del df['lsMathSci']
    del df['lsHealth']
    del df['lsSpNeeds']
    del df['lsHistCiv']
    del df['lsMusArts']
    del df['lsWarm']
    
if 'project_subject_categories'in df:
    y3 = df['project_subject_categories'] # Creates the approval target if we're using the full listing and not one-hot encoded 
    del df['project_subject_categories'] # Remove the categories from the training set
    del df['project_subject_subcategories'] # Remove the subcategories, since they are dependent on the categories

In [4]:
# Create a new column that indicates whether the project is reading, writing, or 
# arithmetic (IsLangLit and/or IsMathSci). 

y2.loc[:,'Three_Rs'] = np.logical_or(y2['lsLangLit'],y2['lsMathSci'])
three_r = y2['Three_Rs']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [27]:
# Concatenate our new column to our original dataframe.
# We also are joining all 4 essay columns into a single column.
df = pd.concat((df_train, three_r),axis=1)
df['text'] = df.apply(lambda row: ' '.join([str(row['project_essay_1']), 
                                            str(row['project_essay_2']), 
                                            str(row['project_essay_3']), 
                                            str(row['project_essay_4'])]), axis=1)
train = pd.concat((df,y1), axis=1).sample(frac=0.2,random_state=200)
test = pd.concat((df,y1), axis = 1).sample(frac = 0.2, random_state = 201)

y_proj_train = train['project_is_approved']
y_proj_test = test['project_is_approved']

if 'project_is_approved' in train: del train['project_is_approved']
if 'project_is_approved' in test: del test['project_is_approved']

In [28]:
# Create and fit our Tfidf Vectorizer
vectorizer = TfidfVectorizer()
text = vectorizer.fit_transform(train['text'])

In [29]:
# Grab the Three R's column as our testing variable.

y = train['Three_Rs']

x_test = vectorizer.transform(test['text'])
y_test = test['Three_Rs']

In [34]:
mnb = MultinomialNB()
mnb.fit(text, y)
mnb.predict(x_test)

mnb_proj = MultinomialNB()
mnb_proj.fit(text, y_proj_train)
mnb_proj.predict(x_test)

print("Accuracy of a Multinomial NB classifier on the Three R's column:",
      accuracy_score(mnb.predict(x_test), y_test))

print("Accuracy of a Multinomial NB classifier on the project approval column:",
      accuracy_score(mnb_proj.predict(x_test), y_test))

Accuracy of a Multinomial NB classifier on the Three R's column: 0.8036302724077329
Accuracy of a Multinomial NB classifier on the project approval column: 0.702960237258348
